Prediction details

: Taxonomy: prokaryotic / eukaryotic (default: p)\n--cores : Number of parallel CPUs to use for threading (default: all)\n--clusterblast : Include ClusterBlast gene cluster comparison analysis (default:y)\n--smcogs : Include smCOG analysis for functional prediction of genes (default:y)\n--fullblast : Include genome-wide BLAST analysis (default:n)\n--fullhmm : Include genome-wide PFAM HMM analysis (default:n)\n--blastdbpath : Specify folder containing CLUSEAN blast database (default:clusean/db)\n--pfamdbpath : Specify folder containing PFAM database (default:clusean/db)\n--geneclustertypes : Gene cluster types to scan for (default:1):\n1 = all\n2 = type I polyketide synthases\n3 = type II polyketide synthases\n4 = type III polyketide synthases\n5 = nonribosomal peptide synthetases\n6 = terpene synthases\n7 = lantibiotics\n8 = bacteriocins\n9 = beta-lactams\n10 = aminoglycosides / aminocyclitols\n11 = aminocoumarins\n12 = siderophores\n13 = ectoines\n14 = butyrolactones\n15 = indoles\n16 = nucleosides\n17 = phosphoglycolipids\n18 = melanins\n19 = others\n--help : this help screen\n" + #Check input file format + if len(sys.argv) < 2 or len(sys.argv[1]) < 1: + print >> sys.stderr, "Please supply valid name for input file." + print "Usage: antismash [options]" + print "From the command line, input antismash --help for more information." + logfile.write("Input format error. Please supply valid name for infile.\n") + logfile.write("Usage: antismash [options]\n") + logfile.write("From the command line, input antismash --help for more information.\n") + logfile.close() + sys.exit(1) + if sys.argv[1] != "--help": + if len(sys.argv[1].split(".")) < 2 or (sys.argv[1].split(".")[-1] != "embl" and sys.argv[1].split(".")[-1] != "EMBL" and sys.argv[1].split(".")[-1] != "emb" and sys.argv[1].split(".")[-1] != "EMB" and sys.argv[1].split(".")[-1] != "genbank" and sys.argv[1].split(".")[-1] != "GENBANK" and sys.argv[1].split(".")[-1] != "gbk" and sys.argv[1].split(".")[-1] != "GBK" and sys.argv[1].split(".")[-1] != "gb" and sys.argv[1].split(".")[-1] != "GB" and sys.argv[1].split(".")[-1] != "fasta" and sys.argv[1].split(".")[-1] != "FASTA" and sys.argv[1].split(".")[-1] != "fas" and sys.argv[1].split(".")[-1] != "FAS" and sys.argv[1].split(".")[-1] != "fa" and sys.argv[1].split(".")[-1] != "FA"): + print >> sys.stderr, "No EMBL/GBK/FASTA file submitted as input. Please supply a valid file with .embl / .gbk / .fasta extension. " + print "Usage: antismash [options]" + print "From the command line, input antismash --help for more information." + logfile.write("Input format error. Please supply a valid file with .embl / .gbk / .fasta extension.\n") + logfile.write("Usage: antismash [options]\n") + logfile.write("From the command line, input antismash --help for more information.\n") + logfile.close() + sys.exit(1) + #Define input filename and make fixes if necessary + infile = sys.argv[1] + try: + testfile = open(infile,"r").read() + except(IOError): + print >> sys.stderr, "Please supply valid name for input file." + print "Usage: antismash [options]" + print "From the command line, input antismash --help for more information." + logfile = open("antismash.log","w") + logfile.write("Input format error. Please supply valid name for infile.\n") + logfile.write("Usage: antismash [options]\n") + logfile.write("From the command line, input antismash --help for more information.\n") + logfile.close() + sys.exit(1) + #Parse absolute paths if found + absolutepath = "n" + if "/" in infile or "\\" in infile: + absolutepath = "y" + lastpos1 = infile.rfind("\\") + lastpos2 = infile.rfind("/") + lastpos = max([lastpos1,lastpos2]) + originpath = infile[:(lastpos + 1)] + infile = infile[(lastpos + 1):] + if sys.platform == ('win32'): + copycommand = 'copy/y "' + originpath + infile + '" ' + infile + ' > nul' + os.system(copycommand) + if sys.platform == ('linux2'): + copycommand = 'cp ' + originpath + infile + " . > /dev/null" + os.system(copycommand) + #genomename = ".".join(infile.split(".")[:-1]) + #for i in genomename: + # if i in '!"#$%&()*+,./:;=>?@[]^`{|}' or i in "'": + # genomename = genomename.replace(i,"") + # if "/" in genomename: + # genomename = genomename.rpartition("/")[2] + # if "\\" in genomename: + # genomename = genomename.rpartition("\\")[2] + genomename = os.path.splitext(os.path.basename(infile))[0] + if sys.platform == ('linux2'): + if genomename != infile.split(".")[-2]: + oldinfile = infile.replace("(","\$").replace(")","\$").replace("*","\\*").replace("&","\\&").replace("!","\\!").replace("$","\\$").replace("{","\\{").replace("}","\\}").replace("|","\\|").replace("`","\\`").replace("'","\\'").replace('"','\\"').replace('?','\\?') + infile = genomename + "." + infile.split(".")[-1] + if "/" in genomename: + genomename = genomename.rpartition("/")[2] + if "\\" in genomename: + genomename = genomename.rpartition("\\")[2] + os.system("cp " + oldinfile + " " + infile) + #Define outputfolder + if absolutepath == "y": + if sys.platform == ('win32'): + dir1 = os.popen("dir/w/ad " + originpath) + dir2 = os.popen("dir/w/ad") + dir1 = dir1.read() + dir2 = dir2.read() + if sys.platform == ('linux2'): + dir1 = os.popen("ls") + dir2 = os.popen("ls " + originpath) + dir1 = dir1.read() + dir2 = dir2.read() + parts = dir1.split(" ") + dir2.split(" ") + else: + if sys.platform == ('win32'): + dir = os.popen("dir/w/ad") + dir = dir.read() + if sys.platform == ('linux2'): + dir = os.popen("ls") + dir = dir.read() + parts = dir.split(" ") + parts2 = [] + for i in parts: + partparts = i.split("\n") + for i in partparts: + i = i.replace("[","") + i = i.replace("]","") + parts2.append(i) + parts = parts2 + oldgenomename = genomename + if genomename in parts: + genomename = genomename + "_" + str(0) + while genomename in parts: + finalpart = genomename.split("_")[-1] + allnumbers = "y" + for i in finalpart: + if i not in ["0","1","2","3","4","5","6","7","8","9"]: + allnumbers = "n" + if allnumbers == "y" and int(finalpart) in range(0,1000): + newgenomename = "" + for i in genomename.split("_")[:-1]: + newgenomename = newgenomename + "_" + i + newgenomename = newgenomename + "_" + str(int(finalpart) + 1) + genomename = newgenomename[1:] + genomename = genomename.replace("__","_") + #Output results folder name for output checking by GUI + resultslocfile = open("resultsfolder.txt","w") + resultslocfile.write(os.getcwd() + os.sep + genomename) + resultslocfile.close() + #Implement defaults + glimmertransl_table = str(1) + genomeconf = "l" + minglength = str(90) + cores = "all" + taxon = "p" + clusterblast = "y" + smcogs = "y" + fullblast = "n" + fullhmm = "n" + if sys.platform == ('win32'): + blastdbpath = '"' + os.getcwd() + "/clusean/db" + '"' + if sys.platform == ('linux2'): + blastdbpath = os.getcwd() + "/clusean/db" + if sys.platform == ('win32'): + pfamdbpath = '"' + os.getcwd() + "/clusean/db/" + '"' + if sys.platform == ('linux2'): + pfamdbpath = os.getcwd() + "/clusean/db/" + geneclustertypes = [1] + #Read user-specified options which may override defaults + if len(sys.argv) > 2 or sys.argv[1] == "--help": + options = sys.argv + if "--" in options[-1] and sys.argv[1] != "--help": + invalidoptions(options[-1]) + #identify option identifiers + identifiers = [] + for i in options: + if "--" in i: + if i not in identifiers: + identifiers.append(i) + else: + invalidoptions("No '--' in given options or option given twice.") + for i in identifiers: + if i != "--help": + value = options[options.index(i) + 1].strip() + if i == "--gtransl": + for k in value: + if k not in ["0","1","2","3","4","5","6","7","8","9"]: + invalidoptions(i + "input is no number") + if int(value) in range(1,24) and int(value) != 7 and int(value) != 8 and int(value) != 17 and int(value) != 18 and int(value) != 19 and int(value) != 20: + glimmertransl_table = value + else: + invalidoptions(i) + elif i == "--genomeconf": + if value == "l" or value == "c": + genomeconf = value + else: + invalidoptions(i) + elif i == "--minglength": + for k in value: + if k not in ["0","1","2","3","4","5","6","7","8","9"]: + invalidoptions(i) + if int(value) in range(30,91): + minglength = value + else: + print >> sys.stderr, "Invalid options input: minimal gene length should be a number between 30-90." + logfile = open("antismash.log","w") + logfile.write("Invalid options input: minimal gene length should be a number between 30-90.\n") + logfile.close() + sys.exit(1) + elif i == "--cores": + for k in value: + if k not in ["0","1","2","3","4","5","6","7","8","9"]: + invalidoptions(i) + if int(value) in range(1,1000): + cores = int(value) + else: + invalidoptions(i) + elif i == "--taxon": + if value == "p" or value == "e": + taxon = value + else: + invalidoptions(i) + elif i == "--clusterblast": + if value == "y" or value == "n": + clusterblast = value + else: + invalidoptions(i) + elif i == "--smcogs": + if value == "y" or value == "n": + smcogs = value + else: + invalidoptions(i) + elif i == "--fullblast": + if value == "y" or value == "n": + fullblast = value + else: + invalidoptions(i) + elif i == "--fullhmm": + if value == "y" or value == "n": + fullhmm = value + else: + invalidoptions(i) + elif i == "--glimmer_prediction": + glimmer_prediction_path = value + elif i == "--blastdbpath": + if sys.platform == ('win32'): + if options[options.index(i) + 1][0] != '"': + value = '"' + options[options.index(i) + 1] + '"' + else: + value = options[options.index(i) + 1] + if ":\\" in value: + blastdbpath = value + elif "\\" in value or "/" in value: + if value[0] == "\\" or value[0] == "/": + blastdbpath = os.getcwd() + value + else: + blastdbpath = os.getcwd() + "\\" + value + else: + blastdbpath = os.getcwd() + "\\" + value + if sys.platform == ('linux2'): + value = options[options.index(i) + 1] + if "\\" in value or "/" in value: + value = value.replace("\\","/") + if value[0] == "/": + blastdbpath = value + else: + blastdbpath = os.getcwd() + "/" + value + else: + blastdbpath = os.getcwd() + "/" + value + elif i == "--pfamdbpath": + if sys.platform == ('win32'): + if options[options.index(i) + 1][0] != '"': + value = '"' + options[options.index(i) + 1] + '"' + else: + value = options[options.index(i) + 1] + if ":\\" in value: + pfamdbpath = value + elif "\\" in value or "/" in value: + if value[0] == "\\" or value[0] == "/": + pfamdbpath = os.getcwd() + value + else: + pfamdbpath = os.getcwd() + "\\" + value + else: + pfamdbpath = os.getcwd() + "\\" + value + if sys.platform == ('linux2'): + value = options[options.index(i) + 1] + if "\\" in value or "/" in value: + value = value.replace("\\","/") + if value[0] == "/": + pfamdbpath = value + else: + pfamdbpath = os.getcwd() + "/" + value + else: + pfamdbpath = os.getcwd() + "/" + value + elif i == "--geneclustertypes": + if "," not in value and value not in ["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19"]: + invalidoptions(i) + else: + types = value.split(",") + types2 = [] + if "1" in types: + types2 = [1] + for j in types: + if int(j) not in range(1,20): + invalidoptions(i) + else: + types2.append(int(j)) + geneclustertypes = types2 + elif i == "--help": + print inputinstructions + sys.exit() + elif i == "--gui": + pass + else: + invalidoptions(i) + + #Determine number of CPUs used + if cores == "all": + try: + nrcpus = multiprocessing.cpu_count() + except(IOError,OSError,NotImplementedError): + nrcpus = 1 + else: + try: + nrcpus = multiprocessing.cpu_count() + except(IOError,OSError,NotImplementedError): + nrcpus = 1 + if cores < nrcpus: + nrcpus = cores + + #Create directory structure needed for file storage + try: + os.mkdir(genomename) + except(IOError,OSError): + pass + hmmoutputfolder = genomename + "/hmmoutput/" + try: + os.mkdir(hmmoutputfolder) + except(IOError,OSError): + pass + nrpspksoutputfolder = genomename + "/nrpspks/" + try: + os.mkdir(nrpspksoutputfolder) + except(IOError,OSError): + pass + nrpspredictoroutputfolder = nrpspksoutputfolder + "nrpspredictor/" + try: + os.mkdir(nrpspredictoroutputfolder) + except(IOError,OSError): + pass + minowanrpsoutputfolder = nrpspksoutputfolder + "minowanrpspred/" + try: + os.mkdir(minowanrpsoutputfolder) + except(IOError,OSError): + pass + minowapksoutputfolder = nrpspksoutputfolder + "minowapkspred/" + try: + os.mkdir(minowapksoutputfolder) + except(IOError,OSError): + pass + minowacaloutputfolder = nrpspksoutputfolder + "minowacalpred/" + try: + os.mkdir(minowacaloutputfolder) + except(IOError,OSError): + pass + pkssignatureoutputfolder = nrpspksoutputfolder + "pkssignatures/" + try: + os.mkdir(pkssignatureoutputfolder) + except(IOError,OSError): + pass + kranalysisoutputfolder = nrpspksoutputfolder + "kr_analysis/" + try: + os.mkdir(kranalysisoutputfolder) + except(IOError,OSError): + pass + clusterblastoutputfolder = genomename + "/clusterblast/" + try: + os.mkdir(clusterblastoutputfolder) + except(IOError,OSError): + pass + smcogsoutputfolder = genomename + "/smcogs/" + try: + os.mkdir(smcogsoutputfolder) + except(IOError,OSError): + pass + substrspecsfolder = genomename + "/substrspecs/" + try: + os.mkdir(substrspecsfolder) + except(IOError,OSError): + pass + structuresfolder = genomename + "/structures/" + try: + os.mkdir(structuresfolder) + except(IOError,OSError): + pass + svgfolder = genomename + "/svg/" + try: + os.mkdir(svgfolder) + except(IOError,OSError): + pass + searchgtrfolder = genomename + "/searchgtr/" + try: + os.mkdir(searchgtrfolder) + except(IOError,OSError): + pass + htmlfolder = genomename + "/html/" + try: + os.mkdir(htmlfolder) + except(IOError,OSError): + pass + imagesfolder = genomename + "/images/" + try: + os.mkdir(imagesfolder) + except(IOError,OSError): + pass + + #If input is unannotated GBK/EMBL file, convert to FASTA and use that as input + if " CDS " not in open(infile,"r").read() and "FT CDS " not in open(infile,"r").read(): + if infile.split(".")[-1] == "embl" or infile.split(".")[-1] == "EMBL" or infile.split(".")[-1] == "emb" or infile.split(".")[-1] == "EMB": + filetext = open(infile,"r").read() + if "\nSQ" not in filetext: + print >> sys.stderr, "Exit: EMBL file not properly formatted, no sequence found." + logfile = open("antismash.log","w") + logfile.write("Exit: EMBL file not properly formatted, no sequence found.\n") + logfile.close() + sys.exit(1) + dnaseq = filetext.split("\nSQ")[1] + dnaseq = cleandnaseq(dnaseq) + sequence = dnaseq + if (sequence.count('A') + sequence.count('a') + sequence.count('C') + sequence.count('c') + sequence.count('G') + sequence.count('g') + sequence.count('T') + sequence.count('t')) < (0.5 * len(sequence)): + print >> sys.stderr, "Protein EMBL file provided. Please provide nucleotide EMBL file." + sys.exit(1) + fastafile = open(infile.rpartition(".")[0] + ".fasta","w") + fastafile.write(">" + infile.rpartition(".")[0] + "|\n") + fastafile.write(sequence) + fastafile.close() + infile = fastafile + elif infile.split(".")[-1] == "gbk" or infile.split(".")[-1] == "GBK" or infile.split(".")[-1] == "gb" or infile.split(".")[-1] == "GB" or infile.split(".")[-1] == "genbank" or infile.split(".")[-1] == "GENBANK": + filetext = open(infile,"r").read() + if "\nORIGIN" not in filetext: + print >> sys.stderr, "Exit: GBK file not properly formatted, no sequence found." + logfile = open("antismash.log","w") + logfile.write("Exit: GBK file not properly formatted, no sequence found.\n") + logfile.close() + sys.exit(1) + dnaseq = filetext.split("\nORIGIN")[1] + dnaseq = cleandnaseq(dnaseq) + sequence = dnaseq + if (sequence.count('A') + sequence.count('a') + sequence.count('C') + sequence.count('c') + sequence.count('G') + sequence.count('g') + sequence.count('T') + sequence.count('t')) < (0.5 * len(sequence)): + print >> sys.stderr, "Protein GBK file provided. Please provide nucleotide GBK file." + sys.exit(1) + fastafile = open(infile.rpartition(".")[0] + ".fasta","w") + fastafile.write(">" + infile.rpartition(".")[0] + "\n") + fastafile.write(sequence) + fastafile.close() + infile = infile.rpartition(".")[0] + ".fasta" + #If input is unannotated fasta file, predict genes with Glimmer and create EMBL file. If input is EMBL or GBK file, read input embl/gbk and create input fasta file, read input protein info into memory + annotated = "y" + if infile.split(".")[-1] == "fasta" or infile.split(".")[-1] == "FASTA" or infile.split(".")[-1] == "FAS" or infile.split(".")[-1] == "fas" or infile.split(".")[-1] == "FA" or infile.split(".")[-1] == "fa": + annotated = "n" + #Check input file formatting + sequence = get_sequence(infile) + if (sequence.count('A') + sequence.count('a') + sequence.count('C') + sequence.count('c') + sequence.count('G') + sequence.count('g') + sequence.count('T') + sequence.count('t')) < (0.5 * len(sequence)): + print >> sys.stderr, "Protein FASTA file provided. Please provide nucleotide FASTA file." + sys.exit(1) + nucleotides = ["A","a","C","c","G","g","T","t","N","n"] + badsequence = "n" + sequence_name = open(infile,"r").read().partition(">")[2].partition("\n")[0] + for i in sequence: + if i not in nucleotides: + badsequence = "y" + if badsequence == "y": + cleaned_sequence = cleandnaseq(sequence) + badsequence = "n" + for i in cleaned_sequence: + if i not in nucleotides: + badsequence = "y" + if badsequence == "n": + writefasta([sequence_name],[cleaned_sequence],infile.rpartition(".")[0] + "_f.fasta") + infile = infile.rpartition(".")[0] + "_f.fasta" + else: + print >>sys.stderr, "Incorrect file formatting. Please submit a properly formatted single-sequence FASTA file." + logfile = open("antismash.log","w") + logfile.write("Incorrect file formatting. Please submit a properly formatted single-sequence FASTA file.\n") + logfile.close() + sys.exit(1) + revseq = reverse_complement(sequence) + seqlength = len(sequence) + + #Print Glimmer notification + #if taxon == "p": + # print "Running Glimmer 3.02 to predict genes in unannotated prokaryotic genome..." + #elif taxon == "e": + # print "Running GlimmerHMM 3.0.1 to predict genes in unannotated eukaryotic genome..." + logfile = open("antismash.log","w") + if taxon == "p": + logfile.write("Running Glimmer 3.02 to predict genes in unannotated prokaryotic genome...\n") + elif taxon == "e": + logfile.write("Running GlimmerHMM 3.0.1 to predict genes in unannotated eukaryotic genome...\n") + #logfile.close() + loginfo = open("antismash.log","r").read() + #logfile.close() + #Copying file and changing to folder to prepare for Glimmer3 prediction + os.mkdir( os.path.join(os.getcwd(), genomename, "geneprediction")) + if sys.platform == ('win32'): + os.system("copy/y " + infile + " geneprediction > nul") + if sys.platform == ('linux2'): + os.system("cp " + infile + " geneprediction > /dev/null") + + os.chdir( os.path.join(os.getcwd(), genomename, "geneprediction")) + fastafile = '../../'+infile + + #Find DNA sequence length + seq = get_sequence(fastafile) + dnaseqlength = len(seq) + #Run Glimmer for prokaryotic sequences, GlimmerHMM for eukaryotic sequences + if taxon == "p": + """ + GlimmerPrediction, not needed since we can predict it in galaxy on our own + if genomeconf == "l": + if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y": + os.popen("tigr-glimmer long-orfs -l -n -t 1.15 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs") + else: + os.system("tigr-glimmer long-orfs -l -n -t 1.15 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs") + else: + if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y": + os.popen("tigr-glimmer long-orfs -n -t 1.15 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs") + else: + os.system("tigr-glimmer long-orfs -n -t 1.15 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs") + if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y": + os.popen("tigr-glimmer extract -t " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs > " + fastafile.rpartition(".")[0] + ".train") + else: + os.system("tigr-glimmer extract -t " + fastafile + " " + fastafile.rpartition(".")[0] + ".longorfs > " + fastafile.rpartition(".")[0] + ".train") + if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y": + os.popen("tigr-glimmer build-icm -r " + fastafile.rpartition(".")[0] + ".icm < " + fastafile.rpartition(".")[0] + ".train") + else: + os.system("tigr-glimmer build-icm -r " + fastafile.rpartition(".")[0] + ".icm < " + fastafile.rpartition(".")[0] + ".train") + if genomeconf == "l": + if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y": + os.popen("tigr-glimmer glimmer3 -l -o50 -g" + minglength + " -q3000 -t30 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".icm " + fastafile.rpartition(".")[0]) + else: + os.system("tigr-glimmer glimmer3 -l -o50 -g" + minglength + " -q3000 -t30 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".icm " + fastafile.rpartition(".")[0]) + else: + if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y": + os.popen("tigr-glimmer glimmer3 -o50 -g" + minglength + " -q3000 -t30 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".icm " + fastafile.rpartition(".")[0]) + else: + os.system("tigr-glimmer glimmer3 -o50 -g" + minglength + " -q3000 -t30 --trans_table " + glimmertransl_table + " " + fastafile + " " + fastafile.rpartition(".")[0] + ".icm " + fastafile.rpartition(".")[0]) + #Convert glimmer predictions into EMBL with sequence + glfile = fastafile.rpartition(".")[0] + ".predict" + + Ende der Glimmer-Prediction + """ + glfile = glimmer_prediction_path + emblfile = fastafile.rpartition(".")[0] + ".embl" + try: + file = open(glfile,"r") + filetext = file.read() + except: + print >> sys.stderr, "Glimmer gene prediction failed. Please check the format of your input FASTA file. Error 11." + logfile = open("antismash.log","w") + logfile.write("Glimmer gene prediction failed. Please check the format of your input FASTA file. Error 11.\n") + logfile.close() + sys.exit(1) + if "orf" not in filetext: + print >> sys.stderr, "Glimmer gene prediction failed: no genes found." + logfile = open("antismash.log","w") + logfile.write("Glimmer gene prediction failed: no genes found.\n") + logfile.close() + sys.exit(1) + filetext = filetext.replace("\r","\n") + lines = filetext.split("\n") + lines = lines[1:-1] + orfnames = [] + starts = [] + ends = [] + strands = [] + starts2 = [] + ends2 = [] + firstline = "y" + for i in lines: + columns = i.split(" ") + columns2 = [] + for i in columns: + if i != "": + columns2.append(i) + columns = columns2 + if len(columns) > 3: + frame = columns[3][0] + strands.append(frame) + else: + frame = "" + if firstline == "y" and frame == "+" and len(columns) > 3: + orfname = str(columns[0]) + orfnames.append(orfname) + if genomeconf == "c" and (int(columns[1]) > int(columns[2])) and (int(columns[1]) > (0.5 * dnaseqlength)): + gstart = (int(columns[2]) % 3) + 1 + if gstart == 3: + gstart = 0 + starts.append(str(gstart)) + ends.append(columns[2]) + starts.append(columns[1]) + ends.append(str(dnaseqlength)) + else: + starts.append(columns[1]) + ends.append(columns[2]) + firstline = "n" + elif firstline == "y" and frame == "-" and len(columns) > 3: + orfname = str(columns[0]) + orfnames.append(orfname) + if genomeconf == "c" and (int(columns[1]) > int(columns[2])) and (int(columns[1]) > (0.5 * dnaseqlength)): + gstart = (int(columns[2]) % 3) + 1 + if gstart == 3: + gstart = 0 + starts.append("complement(" + str(gstart)) + ends.append(columns[2] + ")") + starts.append("complement(" + columns[1]) + ends.append(str(dnaseqlength) + ")") + else: + complstart = "complement(" + str(columns[1]) + starts.append(complstart) + complend = str(columns[2]) + ")" + ends.append(str(complend)) + firstline = "n" + elif frame == "+" and len(columns) > 3: + orfname = str(columns[0]) + orfnames.append(orfname) + starts.append(columns[1]) + ends.append(columns[2]) + elif frame == "-" and len(columns) > 3: + orfname = str(columns[0]) + orfnames.append(orfname) + complstart = "complement(" + str(columns[1]) + starts.append(complstart) + complend = str(columns[2]) + ")" + ends.append(str(complend)) + if len(orfnames) == 0: + print >> sys.stderr, "Glimmer gene prediction failed. Please check the format of your input FASTA file. Error 10." + logfile = open("antismash.log","w") + logfile.write("Glimmer gene prediction failed. Please check the format of your input FASTA file. Error 10.\n") + logfile.close() + sys.exit(1) + out_file = open(emblfile,"w") + a = 0 + #print "Writing EMBL file with Glimmer-predicted genes..." + logfile = open("antismash.log","w") + logfile.write(loginfo) + logfile.write("Writing EMBL file with Glimmer-predicted genes...\n") + #logfile.close() + loginfo = open("antismash.log","r").read() + #logfile.close() + if taxon == "p": + out_file.write("ID A01; SV 1; linear; DNA; STD; PRO; " + str(dnaseqlength) + " BP.\nXX\n") + else: + out_file.write("ID A01; SV 1; linear; DNA; STD; FUN; " + str(dnaseqlength) + " BP.\nXX\n") + out_file.write("AC A01;\nXX\n") + out_file.write("DE " + genomename + ";\nXX\n") + out_file.write("KW none;\nXX\n") + out_file.write("OS unknown;\n") + if taxon == "p": + out_file.write("OC Eubacteria;\nXX\n") + else: + out_file.write("OC Fungi;\nXX\n") + out_file.write("RN [1]\n") + out_file.write("RT ;\n") + out_file.write("RL Unknown.\nXX\n") + out_file.write("FH Key Location/Qualifiers\nFH\n") + out_file.write("FT source 1.." + str(dnaseqlength) + "\n") + for i in orfnames: + out_file.write("FT gene ") + out_file.write(starts[a]) + out_file.write("..") + out_file.write(ends[a]) + out_file.write("\n") + out_file.write('FT /gene="' + i + '"\n') + out_file.write("FT CDS ") + out_file.write(starts[a]) + out_file.write("..") + out_file.write(ends[a]) + out_file.write("\n") + out_file.write('FT /gene="' + i + '"\n') + a += 1 + elif taxon == "e": + """ + GlimmerHMM is executed extern ... in galaxy and will be provided through glimmer_prediction_path + + if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y": + os.popen("glimmerhmm " + fastafile + " train_crypto -o " + fastafile.rpartition(".")[0] + ".predict -g") + else: + os.system("glimmerhmm " + fastafile + " train_crypto -o " + fastafile.rpartition(".")[0] + ".predict -g") + """ + #Convert glimmerhmm predictions into EMBL with sequence + #glfile = fastafile.rpartition(".")[0] + ".predict" + glfile = glimmer_prediction_path + emblfile = fastafile.rpartition(".")[0] + ".embl" + try: + file = open(glfile,"r") + filetext = file.read().replace("\r","") + except: + print >> sys.stderr, "GlimmerHMM gene prediction failed. Please check the format of your input FASTA file. Error 9." + logfile = open("antismash.log","w") + logfile.write("GlimmerHMM gene prediction failed. Please check the format of your input FASTA file. Error 9.\n") + logfile.close() + sys.exit(1) + if "CDS" not in filetext: + print >> sys.stderr, "GlimmerHMM gene prediction failed: no genes found." + logfile = open("antismash.log","w") + logfile.write("GlimmerHMM gene prediction failed: no genes found.\n") + logfile.close() + sys.exit(1) + filetext = filetext.replace("\r","\n") + lines = filetext.split("\n") + lines = lines[2:-1] + orfnames = [] + positions = [] + firstline = "y" + x = 0 + orfnr = 0 + starts = [] + ends = [] + for i in lines: + columns = i.split("\t") + if len(columns) > 1: + if x == 0: + strand = columns[6] + if "mRNA" not in i: + starts.append(columns[3]) + ends.append(columns[4]) + elif x == (len(lines) - 1) or "mRNA" in lines[x + 1]: + strand = columns[6] + starts.append(columns[3]) + ends.append(columns[4]) + orfnr += 1 + if len(str(orfnr)) == 1: + orfname = "orf0000" + str(orfnr) + elif len(str(orfnr)) == 2: + orfname = "orf000" + str(orfnr) + elif len(str(orfnr)) == 3: + orfname = "orf00" + str(orfnr) + elif len(str(orfnr)) == 4: + orfname = "orf0" + str(orfnr) + elif len(str(orfnr)) == 5: + orfname = "orf" + str(orfnr) + orfnames.append(orfname) + if strand == "+": + if len(starts) == 1: + pos = starts[0] + ".." + ends[0] + positions.append(pos) + else: + pos = "join(" + y = 0 + for i in starts: + pos = pos + i + ".." + ends[y] + if i != starts[-1]: + pos = pos + "," + y += 1 + pos = pos + ")" + positions.append(pos) + elif strand == "-": + if len(starts) == 1: + pos = "complement(" + starts[0] + ".." + ends[0] + ")" + positions.append(pos) + else: + pos = "complement(join(" + y = 0 + for i in starts: + pos = pos + i + ".." + ends[y] + if i != starts[-1]: + pos = pos + "," + y += 1 + pos = pos + "))" + positions.append(pos) + starts = [] + ends = [] + elif "mRNA" not in i: + starts.append(columns[3]) + ends.append(columns[4]) + x += 1 + if len(orfnames) == 0: + print >> sys.stderr, "GlimmerHMM gene prediction failed. Please check the format of your input FASTA file. Error: 12" + logfile = open("antismash.log","w") + logfile.write("GlimmerHMM gene prediction failed. Please check the format of your input FASTA file. Error 12\n") + logfile.close() + sys.exit(1) + out_file = open(emblfile,"w") + a = 0 + #print "Writing EMBL file with GlimmerHMM-predicted genes..." + logfile = open("antismash.log","w") + logfile.write(loginfo) + logfile.write("Writing EMBL file with GlimmerHMM-predicted genes...\n") + #logfile.close() + loginfo = open("antismash.log","r").read() + #logfile.close() + out_file.write("ID A01; SV 1; linear; DNA; STD; FUN; " + str(dnaseqlength) + " BP.\nXX\n") + out_file.write("AC A01;\nXX\n") + out_file.write("DE " + genomename + ";\nXX\n") + out_file.write("KW none;\nXX\n") + out_file.write("OS unknown;\n") + out_file.write("OC Fungi;\nXX\n") + out_file.write("RN [1]\n") + out_file.write("RT ;\n") + out_file.write("RL Unknown.\nXX\n") + out_file.write("FH Key Location/Qualifiers\nFH\n") + out_file.write("FT source 1.." + str(dnaseqlength) + "\n") + for i in orfnames: + out_file.write("FT gene ") + out_file.write(positions[a]) + out_file.write("\n") + out_file.write('FT /gene="' + i + '"\n') + out_file.write("FT CDS ") + out_file.write(positions[a]) + out_file.write("\n") + out_file.write('FT /gene="' + i + '"\n') + a += 1 + out_file.write("XX\nSQ Sequence " + str(dnaseqlength) + " BP; " + str(seq.count("a") + seq.count("A")) + " A; " + str(seq.count("c") + seq.count("C")) + " C; " + str(seq.count("g") + seq.count("G")) + " G; " + str(seq.count("t") + seq.count("T")) + " T; " + str(dnaseqlength - (seq.count("a") + seq.count("A") + seq.count("c") + seq.count("C") + seq.count("g") + seq.count("G") + seq.count("t") + seq.count("T"))) + " other;\n") + seq2 = seq + out_file.write(" ") + grouplen=10 + textlen = len(seq) + end = textlen - (textlen % grouplen) + repeated_iterator = [iter(itertools.islice(seq, 0, end))] * grouplen + parts = list(itertools.imap(lambda *chars: ''.join(chars),*repeated_iterator)) + if dnaseqlength%grouplen != 0: + parts.append(seq[-1 * (dnaseqlength%grouplen):]) + w = 1 + for l in parts: + out_file.write(l + " ") + if w == len(parts): + if w%6 == 0 and dnaseqlength%60 != 0: + out_file.write((" " * (10 - dnaseqlength%grouplen) + " " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//") + elif dnaseqlength%60 == 0: + out_file.write((" " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//") + elif w%6 == 5 and dnaseqlength%grouplen == 0: + out_file.write((" " + " " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//") + elif dnaseqlength%grouplen != 0: + out_file.write(" " * (10 - dnaseqlength%grouplen) + " " * (6 - len(parts)%6) + " " * (6 - len(parts)%6) + (" " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//") + else: + out_file.write(" " * (6 - len(parts)%6) + " " * (5 - len(parts)%6) + (" " * (10 - len(str(dnaseqlength)))) + str(dnaseqlength) + "\n//") + elif w%6 == 0: + out_file.write((" " * (10 - len(str(w * 10)))) + str(w * 10) + "\n ") + w += 1 + out_file.close() + os.chdir("../../") + infile = emblfile[6:] + emblfile = emblfile[6:] + if taxon == "p": + glimmeroutputfolder = genomename + "/glimmer/" + elif taxon == "e": + glimmeroutputfolder = genomename + "/glimmerhmm/" + try: + os.mkdir(glimmeroutputfolder) + except(IOError,OSError): + pass + proteins = embl2proteins(infile,sequence) + genomic_accnr = proteins[1] + dnaseqlength = proteins[2] + proteins = proteins[0] + writefasta(proteins[0],proteins[1],genomename + "/genome_proteins.fasta") + else: + #print "Reading embl/gbk file and creating input FASTA file for gene cluster detection..." + logfile.write("Reading embl/gbk file and creating input FASTA file for gene cluster detection...\n") + if infile.split(".")[-1] == "embl" or infile.split(".")[-1] == "EMBL" or infile.split(".")[-1] == "emb" or infile.split(".")[-1] == "EMB": + sequence = "" + proteins = embl2proteins(infile,sequence) + genomic_accnr = proteins[1] + dnaseqlength = proteins[2] + proteins = proteins[0] + writefasta(proteins[0],proteins[1],genomename + "/genome_proteins.fasta") + elif infile.split(".")[-1] == "gbk" or infile.split(".")[-1] == "GBK" or infile.split(".")[-1] == "gb" or infile.split(".")[-1] == "GB" or infile.split(".")[-1] == "genbank" or infile.split(".")[-1] == "GENBANK": + proteins = gbk2proteins(infile) + genomic_accnr = proteins[1] + dnaseqlength = proteins[2] + proteins = proteins[0] + writefasta(proteins[0],proteins[1],genomename + "/genome_proteins.fasta") + accessiondict = proteins[4] + seqdict = {} + fullnamedict = {} + strandsdict = {} + z = 0 + for i in proteins[0]: + name = i.split("|")[4] + seq = proteins[1][z] + seqdict[name] = seq + strand = i.split("|")[3] + strandsdict[name] = strand + fullnamedict[name] = i + z += 1 + + elapsed = (time.time() - starttime) + #print "2968Time since start: " + str(elapsed) + + #Run hmmsearch on proteins from input file and parse output + #print "Performing HMM search on proteins for detection of signature genes..." + logfile.write("Performing HMM search on proteins for detection of signature genes...\n") + hmmslist = ["AMP-binding.hmm","BLS.hmm","CAS.hmm","Chal_sti_synt_C.hmm","Chal_sti_synt_N.hmm","Condensation.hmm","ene_KS.hmm","hyb_KS.hmm","itr_KS.hmm","mod_KS.hmm","tra_KS.hmm","LANC_like.hmm","ATd.hmm","PKS_AT.hmm","PKS_KS.hmm","PP-binding.hmm","t2clf.hmm","t2ks.hmm","t2ks2.hmm","Terpene_synth.hmm","Terpene_synth_C.hmm","strH_like.hmm","neoL_like.hmm","DOIS.hmm","valA_like.hmm","spcFG_like.hmm","spcDK_like_cou.hmm","spcDK_like_glyc.hmm","strK_like1.hmm","strK_like2.hmm","bt1fas.hmm","ft1fas.hmm","t2fas.hmm","hglD.hmm","hglE.hmm","fabH.hmm","AfsA.hmm","IucA_IucC.hmm","ectoine_synt.hmm","phytoene_synt.hmm","Lant_dehyd_N.hmm","Lant_dehyd_C.hmm","Antimicrobial18.hmm","Gallidermin.hmm","L_biotic_typeA.hmm","LE-DUF.hmm","LE-LAC481.hmm","LE-LanBC.hmm","LE-MER+2PEP.hmm","MA-2PEPA.hmm","MA-DUF.hmm","MA-EPI.hmm","MA-LAC481.hmm","MA-NIS+EPI.hmm","MA-NIS.hmm","indsynth.hmm","A-OX.hmm","LmbU.hmm","MoeO5.hmm","LipM.hmm","LipU.hmm","LipV.hmm","ToyB.hmm","TunD.hmm","melC.hmm","strepbact.hmm","goadsporin_like.hmm","Antimicrobial14.hmm","Bacteriocin_IId.hmm","BacteriocIIc_cy.hmm","Bacteriocin_II.hmm","Lactococcin.hmm","Antimicrobial17.hmm","Lactococcin_972.hmm","Bacteriocin_IIc.hmm","LcnG-beta.hmm","Bacteriocin_IIi.hmm","Subtilosin_A.hmm","Cloacin.hmm","Neocarzinostat.hmm","Linocin_M18.hmm","TIGR03603.hmm","TIGR03604.hmm","TIGR03605.hmm","TIGR03731.hmm","TIGR03651.hmm","TIGR03678.hmm","TIGR03693.hmm","TIGR03798.hmm","TIGR03882.hmm","TIGR03601.hmm","TIGR03602.hmm","tabtoxin.hmm","cycdipepsynth.hmm","cyanobactin_synth.hmm","fom1.hmm","bcpB.hmm","frbD.hmm","mitE.hmm",'Lycopene_cycl.hmm','terpene_cyclase.hmm','NapT7.hmm','fung_ggpps.hmm','fung_ggpps2.hmm','dmat.hmm','trichodiene_synth.hmm','novK.hmm','novJ.hmm','novI.hmm','novH.hmm','pur6.hmm','pur10.hmm','nikJ.hmm','nikO.hmm','mvnA.hmm','thiostrepton.hmm','NAD_binding_4.hmm','vlmB.hmm','salQ.hmm','prnB.hmm'] + for i in hmmslist: + hmmsearch = hmmsearch_path + " " + "--cpu " + str(nrcpus) + " -o " + genomename + "/hmmoutput/" + i.split(".")[0] + "_output.txt" + " --noali --tblout " + genomename + "/hmmoutput/" + i.split(".")[0] + ".txt " + hmms_path + i + " " + genomename + "/genome_proteins.fasta" + os.system(hmmsearch) + #print "Parsing HMM outputs..." + logfile.write("Parsing HMM outputs...\n") + detecteddomainsdict = {} + #Extract type I PKS proteins, KS cut-off: 50; AT cut-off: 20; exclude those sequences that score higher on type I FAS HMMs, type IV hglE-like KS domains + t1pksprots = [] + transatpksprots = [] + if 1 in geneclustertypes or 2 in geneclustertypes or 3 in geneclustertypes or 4 in geneclustertypes: + ks = parsehmmoutput(50,hmmoutputfolder + "PKS_KS.txt") + at = parsehmmoutput(50,hmmoutputfolder + "PKS_AT.txt") + ft1fasks = parsehmmoutput(50,hmmoutputfolder + "ft1fas.txt") + bt1fasks = parsehmmoutput(50,hmmoutputfolder + "bt1fas.txt") + hgleks = parsehmmoutput(50,hmmoutputfolder + "hglE.txt") + hgldks = parsehmmoutput(50,hmmoutputfolder + "hglD.txt") + fabhks = parsehmmoutput(50,hmmoutputfolder + "fabH.txt") + pksksprots = ks[0] + pksatprots = at[0] + pksatscores = at[1] + pksksscores = ks[1] + bt1fasprots = bt1fasks[0] + bt1fasscores = bt1fasks[1] + ft1fasprots = ft1fasks[0] + ft1fasscores = ft1fasks[1] + hgleprots = hgleks[0] + hglescores = hgleks[1] + hgldprots = hgldks[0] + hgldscores = hgldks[1] + fabhprots = fabhks[0] + fabhscores = fabhks[1] + for i in pksksprots: + exclude = "n" + score = pksksscores[pksksprots.index(i)] + if i in bt1fasprots: + bt1fasscore = bt1fasscores[bt1fasprots.index(i)] + if float(score) < float(bt1fasscore): + exclude = "y" + if i in ft1fasprots: + ft1fasscore = ft1fasscores[ft1fasprots.index(i)] + if float(score) < float(ft1fasscore): + exclude = "y" + if i in hgldprots: + hgldscore = hgldscores[hgldprots.index(i)] + if float(score) < float(hgldscore): + exclude = "y" + if i in hgleprots: + hglescore = hglescores[hgleprots.index(i)] + if float(score) < float(hglescore): + exclude = "y" + if i in fabhprots: + fabhscore = fabhscores[fabhprots.index(i)] + if float(score) < float(fabhscore): + exclude = "y" + if i in pksatprots and exclude == "n": + t1pksprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["PKS ketosynthase domain",pksksscores[pksksprots.index(i)]]) + detdomlist.append(["PKS acyltransferase domain",pksatscores[pksatprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["PKS ketosynthase domain",pksksscores[pksksprots.index(i)]],["PKS acyltransferase domain",pksatscores[pksatprots.index(i)]]] + #Extract trans-AT PKSs: proteins with KS hits but without AT hits, and with trans-AT specific ATd-hits + atd = parsehmmoutput(65,hmmoutputfolder + "ATd.txt") + traks = parsehmmoutput(50,hmmoutputfolder + "tra_KS.txt") + traksprots = traks[0] + atdprots = atd[0] + atdscores = atd[1] + for i in pksksprots: + if i in atdprots and i in traksprots and i not in t1pksprots: + transatpksprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["PKS ketosynthase domain",pksksscores[pksksprots.index(i)]]) + detdomlist.append(["Trans-AT PKS AT-docking domain",atdscores[atdprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["PKS ketosynthase domain",pksksscores[pksksprots.index(i)]],["Trans-AT PKS AT-docking domain",atdscores[atdprots.index(i)]]] + #Extract type II PKS & CLF proteins, KS-cut-off: 50, t2KS/clf score > modKS,eneKS,itrKS,traKS,t1fas,t2fas,hgle scores + t2pksprots = [] + if 1 in geneclustertypes or 2 in geneclustertypes or 3 in geneclustertypes or 4 in geneclustertypes: + t2ks = parsehmmoutput(50,hmmoutputfolder + "t2ks.txt") + t2ks2 = parsehmmoutput(450,hmmoutputfolder + "t2ks2.txt") + t2clf = parsehmmoutput(50,hmmoutputfolder + "t2clf.txt") + eneks = parsehmmoutput(50,hmmoutputfolder + "ene_KS.txt") + hybks = parsehmmoutput(50,hmmoutputfolder + "hyb_KS.txt") + modks = parsehmmoutput(50,hmmoutputfolder + "mod_KS.txt") + itrks = parsehmmoutput(50,hmmoutputfolder + "itr_KS.txt") + traks = parsehmmoutput(50,hmmoutputfolder + "tra_KS.txt") + t2fasks = parsehmmoutput(50,hmmoutputfolder + "t2fas.txt") + ft1fasks = parsehmmoutput(50,hmmoutputfolder + "ft1fas.txt") + bt1fasks = parsehmmoutput(50,hmmoutputfolder + "bt1fas.txt") + hgleks = parsehmmoutput(50,hmmoutputfolder + "hglE.txt") + hgldks = parsehmmoutput(50,hmmoutputfolder + "hglD.txt") + fabhks = parsehmmoutput(50,hmmoutputfolder + "fabH.txt") + t2ksprots = t2ks[0] + t2ks2prots = t2ks2[0] + t2clfprots = t2clf[0] + eneksprots = eneks[0] + hybksprots = hybks[0] + modksprots = modks[0] + itrksprots = itrks[0] + traksprots = traks[0] + t2fasprots = t2fasks[0] + t2ksscores = t2ks[1] + t2ks2scores = t2ks2[1] + t2clfscores = t2clf[1] + eneksscores = eneks[1] + hybksscores = hybks[1] + modksscores = modks[1] + itrksscores = itrks[1] + traksscores = traks[1] + t2fasscores = t2fasks[1] + bt1fasprots = bt1fasks[0] + bt1fasscores = bt1fasks[1] + ft1fasprots = ft1fasks[0] + ft1fasscores = ft1fasks[1] + hgleprots = hgleks[0] + hglescores = hgleks[1] + hgldprots = hgldks[0] + hgldscores = hgldks[1] + fabhprots = fabhks[0] + fabhscores = fabhks[1] + for i in t2ksprots: + type2 = "y" + score = t2ksscores[t2ksprots.index(i)] + if i in eneksprots: + enescore = eneksscores[eneksprots.index(i)] + if float(enescore) > float(score): + type2 = "n" + if i in hybksprots: + hybscore = hybksscores[hybksprots.index(i)] + if float(hybscore) > float(score): + type2 = "n" + if i in modksprots: + modscore = modksscores[modksprots.index(i)] + if float(modscore) > float(score): + type2 = "n" + if i in itrksprots: + itrscore = itrksscores[itrksprots.index(i)] + if float(itrscore) > float(score): + type2 = "n" + if i in traksprots: + trascore = traksscores[traksprots.index(i)] + if float(trascore) > float(score): + type2 = "n" + if i in bt1fasprots: + bt1fasscore = bt1fasscores[bt1fasprots.index(i)] + if float(bt1fasscore) > float(score): + type2 = "n" + if i in ft1fasprots: + ft1fasscore = ft1fasscores[ft1fasprots.index(i)] + if float(ft1fasscore) > float(score): + type2 = "n" + if i in t2fasprots: + t2fasscore = t2fasscores[t2fasprots.index(i)] + if float(t2fasscore) > float(score): + type2 = "n" + if i in hgleprots: + hglescore = hglescores[hgleprots.index(i)] + if float(hglescore) > float(score): + type2 = "n" + if i in fabhprots: + fabhscore = fabhscores[fabhprots.index(i)] + if float(fabhscore) > float(score): + type2 = "n" + if type2 == "y" and i not in t2pksprots and i not in t1pksprots: + t2pksprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Type II ketosynthase",t2ksscores[t2ksprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Type II ketosynthase",t2ksscores[t2ksprots.index(i)]]] + for i in t2clfprots: + type2 = "y" + score = t2clfscores[t2clfprots.index(i)] + if i in eneksprots: + enescore = eneksscores[eneksprots.index(i)] + if float(enescore) > float(score): + type2 = "n" + if i in hybksprots: + hybscore = hybksscores[hybksprots.index(i)] + if float(hybscore) > float(score): + type2 = "n" + if i in modksprots: + modscore = modksscores[modksprots.index(i)] + if float(modscore) > float(score): + type2 = "n" + if i in itrksprots: + itrscore = itrksscores[itrksprots.index(i)] + if float(itrscore) > float(score): + type2 = "n" + if i in traksprots: + trascore = traksscores[traksprots.index(i)] + if float(trascore) > float(score): + type2 = "n" + if i in bt1fasprots: + bt1fasscore = bt1fasscores[bt1fasprots.index(i)] + if float(bt1fasscore) > float(score): + type2 = "n" + if i in ft1fasprots: + ft1fasscore = ft1fasscores[ft1fasprots.index(i)] + if float(ft1fasscore) > float(score): + type2 = "n" + if i in t2fasprots: + t2fasscore = t2fasscores[t2fasprots.index(i)] + if float(t2fasscore) > float(score): + type2 = "n" + if i in hgleprots: + hglescore = hglescores[hgleprots.index(i)] + if float(hglescore) > float(score): + type2 = "n" + if i in fabhprots: + fabhscore = fabhscores[fabhprots.index(i)] + if float(fabhscore) > float(score): + type2 = "n" + if type2 == "y" and i not in t2pksprots and i not in t1pksprots: + t2pksprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Chain length factor",t2clfscores[t2clfprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Chain length factor",t2clfscores[t2clfprots.index(i)]]] + for i in t2ks2prots: + type2 = "y" + score = t2ks2scores[t2ks2prots.index(i)] + if i in eneksprots: + enescore = eneksscores[eneksprots.index(i)] + if float(enescore) > float(score): + type2 = "n" + if i in hybksprots: + hybscore = hybksscores[hybksprots.index(i)] + if float(hybscore) > float(score): + type2 = "n" + if i in modksprots: + modscore = modksscores[modksprots.index(i)] + if float(modscore) > float(score): + type2 = "n" + if i in itrksprots: + itrscore = itrksscores[itrksprots.index(i)] + if float(itrscore) > float(score): + type2 = "n" + if i in traksprots: + trascore = traksscores[traksprots.index(i)] + if float(trascore) > float(score): + type2 = "n" + if i in bt1fasprots: + bt1fasscore = bt1fasscores[bt1fasprots.index(i)] + if float(bt1fasscore) > float(score): + type2 = "n" + if i in ft1fasprots: + ft1fasscore = ft1fasscores[ft1fasprots.index(i)] + if float(ft1fasscore) > float(score): + type2 = "n" + if i in t2fasprots: + t2fasscore = t2fasscores[t2fasprots.index(i)] + if float(t2fasscore) > float(score): + type2 = "n" + if i in hgleprots: + hglescore = hglescores[hgleprots.index(i)] + if float(hglescore) > float(score): + type2 = "n" + if i in fabhprots: + fabhscore = fabhscores[fabhprots.index(i)] + if float(fabhscore) > float(score): + type2 = "n" + if type2 == "y" and i not in t2pksprots and i not in t1pksprots: + t2pksprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Type II ketosynthase, model 2",t2ks2scores[t2ks2prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Type II ketosynthase, model 2",t2ks2scores[t2ks2prots.index(i)]]] + #Extract type III PKS proteins + t3pksprots = [] + if 1 in geneclustertypes or 2 in geneclustertypes or 3 in geneclustertypes or 4 in geneclustertypes: + t3n = parsehmmoutput(63,hmmoutputfolder + "Chal_sti_synt_N.txt") + t3c = parsehmmoutput(35,hmmoutputfolder + "Chal_sti_synt_C.txt") + t3nprots = t3n[0] + t3nscores = t3n[1] + t3cprots = t3c[0] + t3cscores = t3c[1] + for i in t3cprots: + if i not in t3pksprots and i not in t1pksprots and i not in t2pksprots: + t3pksprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Chalcone/stilbene synthase,C-terminus",t3cscores[t3cprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Chalcone/stilbene synthase,C-terminus",t3cscores[t3cprots.index(i)]]] + for i in t3nprots: + if i not in t3pksprots and i not in t1pksprots and i not in t2pksprots: + t3pksprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Chalcone/stilbene synthase,N-terminus",t3nscores[t3nprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Chalcone/stilbene synthase,N-terminus",t3nscores[t3nprots.index(i)]]] + #Extract 'type IV' hglE-like PKS proteins, cut-off:50; only if not already scored as type 1-3 PKS, and not if FAS HMM has higher score + t4pksprots = [] + if 1 in geneclustertypes or 2 in geneclustertypes or 3 in geneclustertypes or 4 in geneclustertypes: + t2fasks = parsehmmoutput(50,hmmoutputfolder + "t2fas.txt") + t2fasprots = t2fasks[0] + t2fasscores = t2fasks[1] + for i in hgleprots: + type4 = "y" + score = hglescores[hgleprots.index(i)] + if i in bt1fasprots: + bt1fasscore = bt1fasscores[bt1fasprots.index(i)] + if float(bt1fasscore) > float(score): + type4 = "n" + if i in ft1fasprots: + ft1fasscore = ft1fasscores[ft1fasprots.index(i)] + if float(ft1fasscore) > float(score): + type4 = "n" + if i in t2fasprots: + t2fasscore = t2fasscores[t2fasprots.index(i)] + if float(t2fasscore) > float(score): + type4 = "n" + if i in fabhprots: + fabhscore = fabhscores[fabhprots.index(i)] + if float(fabhscore) > float(score): + type4 = "n" + if i not in t1pksprots and i not in t2pksprots and i not in t3pksprots and i not in transatpksprots and type4 == "y": + t4pksprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Atypical PKS domain, HglE-like",hglescores[hgleprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Atypical PKS domain, HglE-like",hglescores[hgleprots.index(i)]]] + for i in hgldprots: + type4 = "y" + score = hgldscores[hgldprots.index(i)] + if i in bt1fasprots: + bt1fasscore = bt1fasscores[bt1fasprots.index(i)] + if float(bt1fasscore) > float(score): + type4 = "n" + if i in ft1fasprots: + ft1fasscore = ft1fasscores[ft1fasprots.index(i)] + if float(ft1fasscore) > float(score): + type4 = "n" + if i in t2fasprots: + t2fasscore = t2fasscores[t2fasprots.index(i)] + if float(t2fasscore) > float(score): + type4 = "n" + if i in fabhprots: + fabhscore = fabhscores[fabhprots.index(i)] + if float(fabhscore) > float(score): + type4 = "n" + if i not in t1pksprots and i not in t2pksprots and i not in t3pksprots and i not in transatpksprots and type4 == "y" and i not in t4pksprots: + t4pksprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Atypical PKS domain, HglD-like",hgldscores[hgldprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Atypical PKS domain, HglD-like",hgldscores[hgldprots.index(i)]]] + #Extract NRPS proteins, C cut-off: 20; A cut-off:20, both should be there, or single domain proteins C,A, or T should be within 20kb of each other or a full NRPS + nrpsprots = [] + if 1 in geneclustertypes or 5 in geneclustertypes: + cond = parsehmmoutput(20,hmmoutputfolder + "Condensation.txt") + amp = parsehmmoutput(20,hmmoutputfolder + "AMP-binding.txt") + ampox = parsehmmoutput(50,hmmoutputfolder + "A-OX.txt") + ampoxprots = ampox[0] + ampoxscores = ampox[1] + for i in ampox[0]: + if i not in amp: + amp.append(i) + cprots = cond[0] + cscores = cond[1] + aprots = amp[0] + ascores = amp[1] + nrpsprots = [] + for i in cprots: + if i in aprots: + nrpsprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Condensation domain",cscores[cprots.index(i)]]) + if i in aprots: + detdomlist.append(["Adenylation domain",ascores[aprots.index(i)]]) + elif i in ampoxprots: + detdomlist.append(["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in aprots: + detecteddomainsdict[i] = [["Condensation domain",cscores[cprots.index(i)]],["Adenylation domain",ascores[aprots.index(i)]]] + elif i in ampoxprots: + detecteddomainsdict[i] = [["Condensation domain",cscores[cprots.index(i)]],["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(i)]]] + for i in t1pksprots: + if i in aprots: + nrpsprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in aprots: + detdomlist.append(["Adenylation domain",ascores[aprots.index(i)]]) + elif i in ampoxprots: + detdomlist.append(["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in aprots: + detecteddomainsdict[i] = [["Adenylation domain",ascores[aprots.index(i)]]] + elif i in ampoxprots: + detecteddomainsdict[i] = [["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(i)]]] + single_aprots = [] + single_cprots = [] + single_pptprots = [] + pptprots = parsehmmoutput(20,hmmoutputfolder + "PP-binding.txt")[0] + for i in aprots: + if i not in nrpsprots: + single_aprots.append(i) + for i in cprots: + if i not in nrpsprots: + single_cprots.append(i) + for i in pptprots: + if i not in nrpsprots: + single_pptprots.append(i) + genelist = proteins[2] + genedict = proteins[3] + single_aprots_positions = {} + single_cprots_positions = {} + single_pptprots_positions = {} + nrpsprots_positions = {} + for j in single_aprots: + if j in genelist: + protstart_abs = min([int(genedict[j][0]),int(genedict[j][1])]) + protend_abs = max([int(genedict[j][0]),int(genedict[j][1])]) + single_aprots_positions[j] = int((protend_abs + protstart_abs) / 2) + for j in single_cprots: + if j in genelist: + protstart_abs = min([int(genedict[j][0]),int(genedict[j][1])]) + protend_abs = max([int(genedict[j][0]),int(genedict[j][1])]) + single_cprots_positions[j] = int((protend_abs + protstart_abs) / 2) + for j in single_pptprots: + if j in genelist: + protstart_abs = min([int(genedict[j][0]),int(genedict[j][1])]) + protend_abs = max([int(genedict[j][0]),int(genedict[j][1])]) + single_pptprots_positions[j] = int((protend_abs + protstart_abs) / 2) + for j in nrpsprots: + if j in genelist: + protstart_abs = min([int(genedict[j][0]),int(genedict[j][1])]) + protend_abs = max([int(genedict[j][0]),int(genedict[j][1])]) + nrpsprots_positions[j] = int((protend_abs + protstart_abs) / 2) + nrpsprots2 = [] + for i in nrpsprots: + nrpsprots2.append(i) + for j in single_aprots: + include = "n" + pos = single_aprots_positions[j] + for i in single_cprots: + pos2 = single_cprots_positions[i] + if abs(pos - pos2) < 20000: + include = "y" + for i in nrpsprots2: + pos2 = nrpsprots_positions[i] + if abs(pos - pos2) < 20000: + include = "y" + if include == "y": + nrpsprots.append(j) + if detecteddomainsdict.has_key(j): + detdomlist = detecteddomainsdict[j] + if j in aprots: + detdomlist.append(["Adenylation domain",ascores[aprots.index(j)]]) + elif j in ampoxprots: + detdomlist.append(["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(j)]]) + detecteddomainsdict[j] = detdomlist + else: + if j in aprots: + detecteddomainsdict[j] = [["Adenylation domain",ascores[aprots.index(j)]]] + elif j in ampoxprots: + detecteddomainsdict[j] = [["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(j)]]] + for j in single_cprots: + include = "n" + pos = single_cprots_positions[j] + for i in single_aprots: + pos2 = single_aprots_positions[i] + if abs(pos - pos2) < 20000: + include = "y" + for i in nrpsprots2: + pos2 = nrpsprots_positions[i] + if abs(pos - pos2) < 20000: + include = "y" + if include == "y": + nrpsprots.append(j) + if detecteddomainsdict.has_key(j): + detdomlist = detecteddomainsdict[j] + detdomlist.append(["Condensation domain",cscores[cprots.index(j)]]) + detecteddomainsdict[j] = detdomlist + else: + detecteddomainsdict[j] = [["Condensation domain",cscores[cprots.index(j)]]] + #Extract Terpene synthase proteins, various cut-offs + terpeneprots = [] + if 1 in geneclustertypes or 6 in geneclustertypes: + terpene = parsehmmoutput(23,hmmoutputfolder + "Terpene_synth_C.txt") + terpeneprots = terpene[0] + terpenescores = terpene[1] + for i in terpeneprots: + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Terpene synthase, C-terminus",terpenescores[terpeneprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Terpene synthase, C-terminus",terpenescores[terpeneprots.index(i)]]] + if 1 in geneclustertypes or 6 in geneclustertypes: + physqualdata = parsehmmoutput(20,hmmoutputfolder + "phytoene_synt.txt") + physqualprots = physqualdata[0] + physqualscores = physqualdata[1] + for i in physqualprots: + if i not in terpeneprots: + terpeneprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Phytoene/squalene synthase",physqualscores[physqualprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Phytoene/squalene synthase",physqualscores[physqualprots.index(i)]]] + if 1 in geneclustertypes or 6 in geneclustertypes: + lycopenedata = parsehmmoutput(80,hmmoutputfolder + "Lycopene_cycl.txt") + lycopeneprots = lycopenedata[0] + lycopenescores = lycopenedata[1] + for i in lycopeneprots: + if i not in terpeneprots: + terpeneprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Lycopene cyclase",lycopenescores[lycopeneprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Lycopene cyclase",lycopenescores[lycopeneprots.index(i)]]] + if 1 in geneclustertypes or 6 in geneclustertypes: + terpene_cyclasesdata = parsehmmoutput(50,hmmoutputfolder + "terpene_cyclase.txt") + terpene_cyclases = terpene_cyclasesdata[0] + terpene_cyclases_scores = terpene_cyclasesdata[1] + for i in terpene_cyclases: + if i not in terpeneprots: + terpeneprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Terpene cyclase",terpene_cyclases_scores[terpene_cyclases.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Terpene cyclase",terpene_cyclases_scores[terpene_cyclases.index(i)]]] + if 1 in geneclustertypes or 6 in geneclustertypes: + NapT7 = parsehmmoutput(250,hmmoutputfolder + "NapT7.txt") + NapT7prots = NapT7[0] + NapT7scores = NapT7[1] + for i in NapT7prots: + if i not in terpeneprots: + terpeneprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["NapT7",NapT7scores[NapT7prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["NapT7",NapT7scores[NapT7prots.index(i)]]] + if 1 in geneclustertypes or 6 in geneclustertypes: + fung_ggpps = parsehmmoutput(420,hmmoutputfolder + "fung_ggpps.txt") + fung_ggppsprots = fung_ggpps[0] + fung_ggppsscores = fung_ggpps[1] + for i in fung_ggppsprots: + if i not in terpeneprots: + terpeneprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Fungal geranylgeranyl pyrophosphate synthase, model 1",fung_ggppsscores[fung_ggppsprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Fungal geranylgeranyl pyrophosphate synthase, model 1",fung_ggppsscores[fung_ggppsprots.index(i)]]] + if 1 in geneclustertypes or 6 in geneclustertypes: + fung_ggpps2 = parsehmmoutput(312,hmmoutputfolder + "fung_ggpps2.txt") + fung_ggpps2prots = fung_ggpps2[0] + fung_ggpps2scores = fung_ggpps2[1] + for i in fung_ggpps2prots: + if i not in terpeneprots: + terpeneprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Fungal geranylgeranyl pyrophosphate synthase, model 2",fung_ggpps2scores[fung_ggpps2prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Fungal geranylgeranyl pyrophosphate synthase, model 2",fung_ggpps2scores[fung_ggpps2prots.index(i)]]] + if 1 in geneclustertypes or 6 in geneclustertypes: + dmat = parsehmmoutput(200,hmmoutputfolder + "dmat.txt") + dmatprots = dmat[0] + dmatscores = dmat[1] + for i in dmatprots: + if i not in terpeneprots: + terpeneprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Dimethylallyl tryptophan synthase",dmatscores[dmatprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Dimethylallyl tryptophan synthase",dmatscores[dmatprots.index(i)]]] + if 1 in geneclustertypes or 6 in geneclustertypes: + trichodiene_synth = parsehmmoutput(150,hmmoutputfolder + "trichodiene_synth.txt") + trichodiene_synthprots = trichodiene_synth[0] + trichodiene_synthscores = trichodiene_synth[1] + for i in trichodiene_synthprots: + if i not in terpeneprots: + terpeneprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Trichodiene synthase",trichodiene_synthscores[trichodiene_synthprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Trichodiene synthase",trichodiene_synthscores[trichodiene_synthprots.index(i)]]] + #Extract lantibiotic proteins, LanC cut-off: 80, Lant_dehN & Lant_dehC combination cut-off: 20 each + lantprots = [] + if 1 in geneclustertypes or 7 in geneclustertypes: + lantc = parsehmmoutput(80,hmmoutputfolder + "LANC_like.txt") + lancprots = lantc[0] + lancscores = lantc[1] + landehn = parsehmmoutput(20,hmmoutputfolder + "Lant_dehyd_N.txt") + landehnprots = landehn[0] + landehnscores = landehn[1] + landehc = parsehmmoutput(20,hmmoutputfolder + "Lant_dehyd_C.txt") + landehcprots = landehc[0] + landehcscores = landehc[1] + lanti1 = parsehmmoutput(20,hmmoutputfolder + "Antimicrobial18.txt") + lanti1prots = lanti1[0] + lanti1scores = lanti1[1] + lanti2 = parsehmmoutput(20,hmmoutputfolder + "Gallidermin.txt") + lanti2prots = lanti2[0] + lanti2scores = lanti2[1] + lanti3 = parsehmmoutput(20,hmmoutputfolder + "L_biotic_typeA.txt") + lanti3prots = lanti3[0] + lanti3scores = lanti3[1] + lanti4 = parsehmmoutput(20,hmmoutputfolder + "LE-DUF.txt") + lanti4prots = lanti4[0] + lanti4scores = lanti4[1] + lanti5 = parsehmmoutput(20,hmmoutputfolder + "LE-LAC481.txt") + lanti5prots = lanti5[0] + lanti5scores = lanti5[1] + lanti6 = parsehmmoutput(20,hmmoutputfolder + "LE-LanBC.txt") + lanti6prots = lanti6[0] + lanti6scores = lanti6[1] + lanti7 = parsehmmoutput(20,hmmoutputfolder + "LE-MER+2PEP.txt") + lanti7prots = lanti7[0] + lanti7scores = lanti7[1] + lanti8 = parsehmmoutput(20,hmmoutputfolder + "MA-2PEPA.txt") + lanti8prots = lanti8[0] + lanti8scores = lanti8[1] + lanti9 = parsehmmoutput(20,hmmoutputfolder + "MA-DUF.txt") + lanti9prots = lanti9[0] + lanti9scores = lanti9[1] + lanti10 = parsehmmoutput(20,hmmoutputfolder + "MA-EPI.txt") + lanti10prots = lanti10[0] + lanti10scores = lanti10[1] + lanti11 = parsehmmoutput(20,hmmoutputfolder + "MA-LAC481.txt") + lanti11prots = lanti11[0] + lanti11scores = lanti11[1] + lanti12 = parsehmmoutput(20,hmmoutputfolder + "MA-NIS+EPI.txt") + lanti12prots = lanti12[0] + lanti12scores = lanti12[1] + lanti13 = parsehmmoutput(20,hmmoutputfolder + "MA-NIS.txt") + lanti13prots = lanti13[0] + lanti13scores = lanti13[1] + lanti14 = parsehmmoutput(18,hmmoutputfolder + "TIGR03731.txt") + lanti14prots = lanti14[0] + lanti14scores = lanti14[1] + lantiprots = lanti1prots + lanti2prots + lanti3prots + lanti4prots + lanti5prots + lanti6prots + lanti7prots + lanti8prots + lanti9prots + lanti10prots + lanti11prots + lanti12prots + lanti13prots + lanti14prots + lantiprots2 = [] + for i in lantiprots: + if i not in lantiprots2: + lantiprots2.append(i) + lantiprots = lantiprots2 + for i in lancprots: + lantprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["LanC lanthionine synthase domain",lancscores[lancprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["LanC lanthionine synthase domain",lancscores[lancprots.index(i)]]] + for i in landehnprots: + if i in landehcprots and i not in lantprots: + lantprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Lantibiotic dehydratase, N-terminus",landehnscores[landehnprots.index(i)]]) + detdomlist.append(["Lantibiotic dehydratase, C-terminus",landehcscores[landehcprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Lantibiotic dehydratase, N-terminus",landehnscores[landehnprots.index(i)]],["Lantibiotic dehydratase, C-terminus",landehcscores[landehcprots.index(i)]]] + for i in lantiprots: + if i not in lantprots: + lantprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in lanti1prots: + detdomlist.append(["Antimicrobial18 domain",lanti1scores[lanti1prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in lanti1prots: + detecteddomainsdict[i] = [["Antimicrobial18 domain",lanti1scores[lanti1prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in lanti2prots: + detdomlist.append(["Gallidermin domain",lanti2scores[lanti2prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in lanti2prots: + detecteddomainsdict[i] = [["Gallidermin domain",lanti2scores[lanti2prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in lanti3prots: + detdomlist.append(["L_biotic_typeA domain",lanti3scores[lanti3prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in lanti3prots: + detecteddomainsdict[i] = [["L_biotic_typeA domain",lanti3scores[lanti3prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in lanti4prots: + detdomlist.append(["LE-DUF domain",lanti4scores[lanti4prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in lanti4prots: + detecteddomainsdict[i] = [["LE-DUF domain",lanti4scores[lanti4prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in lanti5prots: + detdomlist.append(["LE-LAC481 domain",lanti5scores[lanti5prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in lanti5prots: + detecteddomainsdict[i] = [["LE-LAC481 domain",lanti5scores[lanti5prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in lanti6prots: + detdomlist.append(["LE-LanBC domain",lanti6scores[lanti6prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in lanti6prots: + detecteddomainsdict[i] = [["LE-LanBC domain",lanti6scores[lanti6prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in lanti7prots: + detdomlist.append(["LE-MER+2PEP domain",lanti7scores[lanti7prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in lanti7prots: + detecteddomainsdict[i] = [["LE-MER+2PEP domain",lanti7scores[lanti7prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in lanti8prots: + detdomlist.append(["MA-2PEPA domain",lanti8scores[lanti8prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in lanti8prots: + detecteddomainsdict[i] = [["MA-2PEPA domain",lanti8scores[lanti8prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in lanti9prots: + detdomlist.append(["MA-DUF domain",lanti9scores[lanti9prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in lanti9prots: + detecteddomainsdict[i] = [["MA-DUF domain",lanti9scores[lanti9prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in lanti10prots: + detdomlist.append(["MA-EPI domain",lanti10scores[lanti10prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in lanti10prots: + detecteddomainsdict[i] = [["MA-EPI domain",lanti10scores[lanti10prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in lanti11prots: + detdomlist.append(["MA-LAC481 domain",lanti11scores[lanti11prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in lanti11prots: + detecteddomainsdict[i] = [["MA-LAC481 domain",lanti11scores[lanti11prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in lanti12prots: + detdomlist.append(["MA-NIS+EPI domain",lanti12scores[lanti12prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in lanti12prots: + detecteddomainsdict[i] = [["MA-NIS+EPI domain",lanti12scores[lanti12prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in lanti13prots: + detdomlist.append(["MA-NIS domain",lanti13scores[lanti13prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in lanti13prots: + detecteddomainsdict[i] = [["MA-NIS domain",lanti13scores[lanti13prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in lanti14prots: + detdomlist.append(["TIGR03731: lantibiotic, gallidermin/nisin family",lanti14scores[lanti14prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in lanti14prots: + detecteddomainsdict[i] = [["TIGR03731: lantibiotic, gallidermin/nisin family",lanti14scores[lanti14prots.index(i)]]] + #Bacteriocin proteins, various cut-offs + bcinprots = [] + if 1 in geneclustertypes or 8 in geneclustertypes: + bcin1prots = parsehmmoutput(50,hmmoutputfolder + "strepbact.txt")[0] + bcin2prots = parsehmmoutput(90,hmmoutputfolder + "Antimicrobial14.txt")[0] + bcin3prots = parsehmmoutput(23,hmmoutputfolder + "Bacteriocin_IId.txt")[0] + bcin4prots = parsehmmoutput(92,hmmoutputfolder + "BacteriocIIc_cy.txt")[0] + bcin5prots = parsehmmoutput(40,hmmoutputfolder + "Bacteriocin_II.txt")[0] + bcin6prots = parsehmmoutput(24,hmmoutputfolder + "Lactococcin.txt")[0] + bcin7prots = parsehmmoutput(31,hmmoutputfolder + "Antimicrobial17.txt")[0] + bcin8prots = parsehmmoutput(25,hmmoutputfolder + "Lactococcin_972.txt")[0] + bcin9prots = parsehmmoutput(27,hmmoutputfolder + "Bacteriocin_IIc.txt")[0] + bcin10prots = parsehmmoutput(78,hmmoutputfolder + "LcnG-beta.txt")[0] + bcin11prots = parsehmmoutput(56,hmmoutputfolder + "Bacteriocin_IIi.txt")[0] + bcin12prots = parsehmmoutput(98,hmmoutputfolder + "Subtilosin_A.txt")[0] + bcin13prots = parsehmmoutput(27,hmmoutputfolder + "Cloacin.txt")[0] + bcin14prots = parsehmmoutput(25,hmmoutputfolder + "Linocin_M18.txt")[0] + bcin15prots = parsehmmoutput(150,hmmoutputfolder + "TIGR03603.txt")[0] + bcin16prots = parsehmmoutput(440,hmmoutputfolder + "TIGR03604.txt")[0] + bcin17prots = parsehmmoutput(200,hmmoutputfolder + "TIGR03605.txt")[0] + bcin18prots = parsehmmoutput(18,hmmoutputfolder + "TIGR03651.txt")[0] + bcin19prots = parsehmmoutput(35,hmmoutputfolder + "TIGR03678.txt")[0] + bcin20prots = parsehmmoutput(400,hmmoutputfolder + "TIGR03693.txt")[0] + bcin21prots = parsehmmoutput(16,hmmoutputfolder + "TIGR03798.txt")[0] + bcin22prots = parsehmmoutput(150,hmmoutputfolder + "TIGR03882.txt")[0] + bcin23prots = parsehmmoutput(50,hmmoutputfolder + "TIGR03601.txt")[0] + bcin24prots = parsehmmoutput(50,hmmoutputfolder + "TIGR03602.txt")[0] + bcin25prots = parsehmmoutput(20,hmmoutputfolder + "mvnA.txt")[0] + bcin26prots = parsehmmoutput(20,hmmoutputfolder + "thiostrepton.txt")[0] + bcin1scores = parsehmmoutput(50,hmmoutputfolder + "strepbact.txt")[1] + bcin2scores = parsehmmoutput(90,hmmoutputfolder + "Antimicrobial14.txt")[1] + bcin3scores = parsehmmoutput(23,hmmoutputfolder + "Bacteriocin_IId.txt")[1] + bcin4scores = parsehmmoutput(92,hmmoutputfolder + "BacteriocIIc_cy.txt")[1] + bcin5scores = parsehmmoutput(40,hmmoutputfolder + "Bacteriocin_II.txt")[1] + bcin6scores = parsehmmoutput(24,hmmoutputfolder + "Lactococcin.txt")[1] + bcin7scores = parsehmmoutput(31,hmmoutputfolder + "Antimicrobial17.txt")[1] + bcin8scores = parsehmmoutput(25,hmmoutputfolder + "Lactococcin_972.txt")[1] + bcin9scores = parsehmmoutput(27,hmmoutputfolder + "Bacteriocin_IIc.txt")[1] + bcin10scores = parsehmmoutput(78,hmmoutputfolder + "LcnG-beta.txt")[1] + bcin11scores = parsehmmoutput(56,hmmoutputfolder + "Bacteriocin_IIi.txt")[1] + bcin12scores = parsehmmoutput(98,hmmoutputfolder + "Subtilosin_A.txt")[1] + bcin13scores = parsehmmoutput(27,hmmoutputfolder + "Cloacin.txt")[1] + bcin14scores = parsehmmoutput(25,hmmoutputfolder + "Linocin_M18.txt")[1] + bcin15scores = parsehmmoutput(150,hmmoutputfolder + "TIGR03603.txt")[1] + bcin16scores = parsehmmoutput(440,hmmoutputfolder + "TIGR03604.txt")[1] + bcin17scores = parsehmmoutput(200,hmmoutputfolder + "TIGR03605.txt")[1] + bcin18scores = parsehmmoutput(18,hmmoutputfolder + "TIGR03651.txt")[1] + bcin19scores = parsehmmoutput(35,hmmoutputfolder + "TIGR03678.txt")[1] + bcin20scores = parsehmmoutput(400,hmmoutputfolder + "TIGR03693.txt")[1] + bcin21scores = parsehmmoutput(16,hmmoutputfolder + "TIGR03798.txt")[1] + bcin22scores = parsehmmoutput(150,hmmoutputfolder + "TIGR03882.txt")[1] + bcin23scores = parsehmmoutput(50,hmmoutputfolder + "TIGR03601.txt")[1] + bcin24scores = parsehmmoutput(50,hmmoutputfolder + "TIGR03602.txt")[1] + bcin25scores = parsehmmoutput(20,hmmoutputfolder + "mvnA.txt")[1] + bcin26scores = parsehmmoutput(20,hmmoutputfolder + "thiostrepton.txt")[1] + bcinprots = bcin1prots + bcin2prots + bcin3prots + bcin4prots + bcin5prots + bcin6prots + bcin7prots + bcin8prots + bcin9prots + bcin10prots + bcin11prots + bcin12prots + bcin13prots + bcin14prots + bcin15prots + bcin16prots + bcin17prots + bcin18prots + bcin19prots + bcin20prots + bcin21prots + bcin22prots + bcin23prots + bcin24prots + bcin25prots + bcin26prots + bcinprots2 = [] + for i in bcinprots: + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin1prots: + detdomlist.append(["Putative Streptomyces bacteriocin",bcin1scores[bcin1prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin1prots: + detecteddomainsdict[i] = [["Putative Streptomyces bacteriocin",bcin1scores[bcin1prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin2prots: + detdomlist.append(["Antimicrobial14 domain",bcin2scores[bcin2prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin2prots: + detecteddomainsdict[i] = [["Antimicrobial14 domain",bcin2scores[bcin2prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin3prots: + detdomlist.append(["Bacteriocin_IId domain",bcin3scores[bcin3prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin3prots: + detecteddomainsdict[i] = [["Bacteriocin_IId domain",bcin3scores[bcin3prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin4prots: + detdomlist.append(["BacteriocIIc_cy domain",bcin4scores[bcin4prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin4prots: + detecteddomainsdict[i] = [["BacteriocIIc_cy domain",bcin4scores[bcin4prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin5prots: + detdomlist.append(["Bacteriocin_II domain",bcin5scores[bcin5prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin5prots: + detecteddomainsdict[i] = [["Bacteriocin_II domain",bcin5scores[bcin5prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin6prots: + detdomlist.append(["Lactococcin",bcin6scores[bcin6prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin6prots: + detecteddomainsdict[i] = [["Lactococcin",bcin6scores[bcin6prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin7prots: + detdomlist.append(["Antimicrobial17 domain",bcin7scores[bcin7prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin7prots: + detecteddomainsdict[i] = [["Antimicrobial17 domain",bcin7scores[bcin7prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin8prots: + detdomlist.append(["Lactococcin_972 domain",bcin8scores[bcin8prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin8prots: + detecteddomainsdict[i] = [["Lactococcin_972 domain",bcin8scores[bcin8prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin9prots: + detdomlist.append(["Bacteriocin_IIc domain",bcin9scores[bcin9prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin9prots: + detecteddomainsdict[i] = [["Bacteriocin_IIc domain",bcin9scores[bcin9prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin10prots: + detdomlist.append(["LcnG-beta domain",bcin10scores[bcin10prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin10prots: + detecteddomainsdict[i] = [["LcnG-beta domain",bcin10scores[bcin10prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin11prots: + detdomlist.append(["Bacteriocin_IIi domain",bcin11scores[bcin11prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin11prots: + detecteddomainsdict[i] = [["Bacteriocin_IIi domain",bcin11scores[bcin11prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin12prots: + detdomlist.append(["Subtilosin_A domain",bcin12scores[bcin12prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin12prots: + detecteddomainsdict[i] = [["Subtilosin_A domain",bcin12scores[bcin12prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin13prots: + detdomlist.append(["Cloacin domain",bcin13scores[bcin13prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin13prots: + detecteddomainsdict[i] = [["Cloacin domain",bcin13scores[bcin13prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin14prots: + detdomlist.append(["Linocin_M18 domain",bcin14scores[bcin14prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin14prots: + detecteddomainsdict[i] = [["Linocin_M18 domain",bcin14scores[bcin14prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin15prots: + detdomlist.append(["TIGR03603: bacteriocin biosynthesis cyclodehydratase",bcin15scores[bcin15prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin15prots: + detecteddomainsdict[i] = [["TIGR03603: bacteriocin biosynthesis cyclodehydratase",bcin15scores[bcin15prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin16prots: + detdomlist.append(["TGIR03604: bacteriocin biosynthesis docking scaffold",bcin16scores[bcin16prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin16prots: + detecteddomainsdict[i] = [["TGIR03604: bacteriocin biosynthesis docking scaffold",bcin16scores[bcin16prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin17prots: + detdomlist.append(["TGIR03605: SagB-type dehydrogenase",bcin17scores[bcin17prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin17prots: + detecteddomainsdict[i] = [["TGIR03605: SagB-type dehydrogenase",bcin17scores[bcin17prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin18prots: + detdomlist.append(["TIGR03651: bacteriocin, circularin A/uberolysin family",bcin18scores[bcin18prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin18prots: + detecteddomainsdict[i] = [["TIGR03651: bacteriocin, circularin A/uberolysin family",bcin18scores[bcin18prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin19prots: + detdomlist.append(["TIGR03678: bacteriocin, microcyclamide/patellamide family",bcin19scores[bcin19prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin19prots: + detecteddomainsdict[i] = [["TIGR03678: bacteriocin, microcyclamide/patellamide family",bcin19scores[bcin19prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin20prots: + detdomlist.append(["TIGR03693: thiazole-containing bacteriocin maturation protein",bcin20scores[bcin20prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin20prots: + detecteddomainsdict[i] = [["TIGR03693: thiazole-containing bacteriocin maturation protein",bcin20scores[bcin20prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin21prots: + detdomlist.append(["TIGR03798: bacteriocin propeptide",bcin21scores[bcin21prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin21prots: + detecteddomainsdict[i] = [["TIGR03798: bacteriocin propeptide",bcin21scores[bcin21prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin22prots: + detdomlist.append(["TIGR03882: bacteriocin biosynthesis cyclodehydratase",bcin22scores[bcin22prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin22prots: + detecteddomainsdict[i] = [["TIGR03882: bacteriocin biosynthesis cyclodehydratase",bcin22scores[bcin22prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin23prots: + detdomlist.append(["TIGR03601: bacteriocin, BA_2677 family",bcin23scores[bcin23prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin23prots: + detecteddomainsdict[i] = [["TIGR03601: bacteriocin, BA_2677 family",bcin23scores[bcin23prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin24prots: + detdomlist.append(["TIGR03602: bacteriocin protoxin, streptolysin S family",bcin24scores[bcin24prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin24prots: + detecteddomainsdict[i] = [["TIGR03602: bacteriocin protoxin, streptolysin S family",bcin24scores[bcin24prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin25prots: + detdomlist.append(["Bacteriocin, microviridin family",bcin25scores[bcin25prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin25prots: + detecteddomainsdict[i] = [["Bacteriocin, microviridin family",bcin25scores[bcin25prots.index(i)]]] + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + if i in bcin26prots: + detdomlist.append(["Thiopeptide, thiostrepton-like",bcin26scores[bcin26prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in bcin26prots: + detecteddomainsdict[i] = [["Thiopeptide, thiostrepton-like",bcin26scores[bcin26prots.index(i)]]] + if i not in bcinprots2: + bcinprots2.append(i) + bcinprots = bcinprots2 + #Extract beta-lactam synthetase proteins, cut-off: 250 + lactamprots = [] + if 1 in geneclustertypes or 9 in geneclustertypes: + bls = parsehmmoutput(250,hmmoutputfolder + "BLS.txt") + blsprots = bls[0] + blsscores = bls[1] + for i in bls[0]: + lactamprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Beta-lactam synthase",blsscores[blsprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Beta-lactam synthase",blsscores[blsprots.index(i)]]] + cas = parsehmmoutput(250,hmmoutputfolder + "CAS.txt") + casprots = cas[0] + casscores = cas[1] + for i in cas[0]: + if i not in lactamprots: + lactamprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Clavulanic acid synthase-like",casscores[casprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Clavulanic acid synthase-like",casscores[casprots.index(i)]]] + tabtoxin = parsehmmoutput(500,hmmoutputfolder + "tabtoxin.txt") + tabtoxinprots = tabtoxin[0] + tabtoxinscores = tabtoxin[1] + for i in tabtoxin[0]: + if i not in lactamprots: + lactamprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Tabtoxin synthase-like",tabtoxinscores[tabtoxinprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Tabtoxin synthase-like",tabtoxinscores[tabtoxinprots.index(i)]]] + #Extract aminoglycoside / aminocyclitol biosynthesis clusters, clusters taken from Flatt & Mahmud et al. 2007 + amglyccyclprots = [] + if 1 in geneclustertypes or 10 in geneclustertypes: + strH = parsehmmoutput(200,hmmoutputfolder + "strH_like.txt") + strhprots = strH[0] + strhscores = strH[1] + for i in strH[0]: + amglyccyclprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["StrH-like glycosyltransferase",strhscores[strhprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["StrH-like glycosyltransferase",strhscores[strhprots.index(i)]]] + strK1 = parsehmmoutput(800,hmmoutputfolder + "strK_like1.txt") + strk1prots = strK1[0] + strk1scores = strK1[1] + for i in strK1[0]: + amglyccyclprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["StrK-like phosphatase",strk1scores[strk1prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["StrK-like phosphatase",strk1scores[strk1prots.index(i)]]] + strK2 = parsehmmoutput(650,hmmoutputfolder + "strK_like2.txt") + strk2prots = strK2[0] + strk2scores = strK2[1] + for i in strK2[0]: + amglyccyclprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["StrK-like phosphatase, model 2",strk2scores[strk2prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["StrK-like phosphatase, model 2",strk2scores[strk2prots.index(i)]]] + neoL = parsehmmoutput(50,hmmoutputfolder + "neoL_like.txt") + neolprots = neoL[0] + neolscores = neoL[1] + for i in neoL[0]: + amglyccyclprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["NeoL-like deacetylase",neolscores[neolprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["NeoL-like deacetylase",neolscores[neolprots.index(i)]]] + DOIS = parsehmmoutput(500,hmmoutputfolder + "DOIS.txt") + doisprots = DOIS[0] + doisscores = DOIS[1] + for i in DOIS[0]: + amglyccyclprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["2-deoxy-scyllo-inosose synthase",doisscores[doisprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["2-deoxy-scyllo-inosose synthase",doisscores[doisprots.index(i)]]] + valA = parsehmmoutput(600,hmmoutputfolder + "valA_like.txt") + valaprots = valA[0] + valascores = valA[1] + for i in valA[0]: + amglyccyclprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["2-epi-5-epi-valiolone synthase, ValA-like",valascores[valaprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["2-epi-5-epi-valiolone synthase, ValA-like",valascores[valaprots.index(i)]]] + spcFG = parsehmmoutput(200,hmmoutputfolder + "spcFG_like.txt") + spcfgprots = spcFG[0] + spcfgscores = spcFG[1] + for i in spcFG[0]: + amglyccyclprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["SpcF/SpcG-like glycosyltransferase",spcfgscores[spcfgprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["SpcF/SpcG-like glycosyltransferase",spcfgscores[spcfgprots.index(i)]]] + spcDK_glyc = parsehmmoutput(600,hmmoutputfolder + "spcDK_like_glyc.txt") + spcdkglycprots = spcDK_glyc[0] + spcdkglycscores = spcDK_glyc[1] + for i in spcDK_glyc[0]: + amglyccyclprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["SpcD/SpcK-like thymidylyltransferase",spcdkglycscores[spcdkglycprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["SpcD/SpcK-like thymidylyltransferase",spcdkglycscores[spcdkglycprots.index(i)]]] + salQ = parsehmmoutput(480,hmmoutputfolder + "salQ.txt") + salqprots = salQ[0] + salqscores = salQ[1] + for i in salqprots: + amglyccyclprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["2-epi-5-epi-valiolone synthase, SalQ-like",salqscores[salqprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["2-epi-5-epi-valiolone synthase, SalQ-like",salqscores[salqprots.index(i)]]] + #Extract aminocoumarin biosynthesis clusters + aminocoumarinprots = [] + if 1 in geneclustertypes or 11 in geneclustertypes: + novK = parsehmmoutput(200,hmmoutputfolder + "novK.txt") + novkprots = novK[0] + novkscores = novK[1] + for i in novkprots: + aminocoumarinprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["NovK-like reductase",novkscores[novkprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["NovK-like reductase",novkscores[novkprots.index(i)]]] + novJ = parsehmmoutput(350,hmmoutputfolder + "novJ.txt") + novjprots = novJ[0] + novjscores = novJ[1] + for i in novjprots: + aminocoumarinprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["NovJ-like reductase",novjscores[novjprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["NovJ-like reductase",novjscores[novjprots.index(i)]]] + novI = parsehmmoutput(600,hmmoutputfolder + "novI.txt") + noviprots = novI[0] + noviscores = novI[1] + for i in noviprots : + aminocoumarinprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["NovI-like cytochrome P450",noviscores[noviprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["NovI-like cytochrome P450",noviscores[noviprots.index(i)]]] + novH = parsehmmoutput(750,hmmoutputfolder + "novH.txt") + novhprots = novH[0] + novhscores = novH[1] + for i in novhprots: + aminocoumarinprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["NovH-like protein",novhscores[novhprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["NovH-like protein",novhscores[novhprots.index(i)]]] + spcDK_like_cou = parsehmmoutput(600,hmmoutputfolder + "spcDK_like_cou.txt") + spcDK_like_cou_prots = spcDK_like_cou[0] + spcDK_like_cou_scores = spcDK_like_cou[1] + for i in spcDK_like_cou_prots: + aminocoumarinprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["SpcD/SpcK-like thymidylyltransferase, aminocoumarins group",spcDK_like_cou_scores[spcDK_like_cou_prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["SpcD/SpcK-like thymidylyltransferase, aminocoumarins group",spcDK_like_cou_scores[spcDK_like_cou_prots.index(i)]]] + #Extract siderophores biosynthesis proteins, IucA/C and AlcB + siderophoreprots = [] + if 1 in geneclustertypes or 12 in geneclustertypes: + siderophore = parsehmmoutput(30,hmmoutputfolder + "IucA_IucC.txt") + siderophoreprots = siderophore[0] + siderophorescores = siderophore[1] + for i in siderophoreprots: + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["IucA-IucC domain",siderophorescores[siderophoreprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["IucA-IucC domain",siderophorescores[siderophoreprots.index(i)]]] + #Extract ectoine biosynthesis proteins + ectprots = [] + if 1 in geneclustertypes or 13 in geneclustertypes: + ect = parsehmmoutput(35,hmmoutputfolder + "ectoine_synt.txt") + ectprots = ect[0] + ectscores = ect[1] + for i in ectprots: + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Ectoine synthase",ectscores[ectprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Ectoine synthase",ectscores[ectprots.index(i)]]] + #Extract butyrolactone biosynthesis proteins + butyrprots = [] + if 1 in geneclustertypes or 14 in geneclustertypes: + butyr= parsehmmoutput(25,hmmoutputfolder + "AfsA.txt") + butyrprots = butyr[0] + butyrscores = butyr[1] + for i in butyrprots: + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["AfsA butyrolactone synthesis domain",butyrscores[butyrprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["AfsA butyrolactone synthesis domain",butyrscores[butyrprots.index(i)]]] + #Extract indole biosynthesis proteins + indoleprots = [] + if 1 in geneclustertypes or 15 in geneclustertypes: + indole = parsehmmoutput(100,hmmoutputfolder + "indsynth.txt") + indoleprots = indole[0] + indolescores = indole[1] + for i in indoleprots: + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["StaD-like chromopyrrolic acid synthase domain",indolescores[indoleprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["StaD-like chromopyrrolic acid synthase domain",indolescores[indoleprots.index(i)]]] + #Extract nucleoside antibiotic biosynthesis proteins + nucleoprots = [] + if 1 in geneclustertypes or 16 in geneclustertypes: + nucleoprots = [] + lipm = parsehmmoutput(50,hmmoutputfolder + "LipM.txt") + lipmprots = lipm[0] + lipmscores = lipm[1] + lipu = parsehmmoutput(30,hmmoutputfolder + "LipU.txt") + lipuprots = lipu[0] + lipuscores = lipu[1] + lipv = parsehmmoutput(375,hmmoutputfolder + "LipV.txt") + lipvprots = lipv[0] + lipvscores = lipv[1] + toyb = parsehmmoutput(175,hmmoutputfolder + "ToyB.txt") + toybprots = toyb[0] + toybscores = toyb[1] + tund = parsehmmoutput(200,hmmoutputfolder + "TunD.txt") + tundprots = tund[0] + tundscores = tund[1] + pur6 = parsehmmoutput(200,hmmoutputfolder + "pur6.txt") + pur6prots = pur6[0] + pur6scores = pur6[1] + pur10 = parsehmmoutput(600,hmmoutputfolder + "pur10.txt") + pur10prots = pur10[0] + pur10scores = pur10[1] + nikj = parsehmmoutput(200,hmmoutputfolder + "nikJ.txt") + nikjprots = nikj[0] + nikjscores = nikj[1] + niko = parsehmmoutput(400,hmmoutputfolder + "nikO.txt") + nikoprots = niko[0] + nikoscores = niko[1] + for i in lipmprots: + if i not in nucleoprots: + nucleoprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["LipM-like nucleotidyltransferase",lipmscores[lipmprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["LipM-like nucleotidyltransferase",lipmscores[lipmprots.index(i)]]] + for i in lipuprots: + if i not in nucleoprots: + nucleoprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["LipU-like protein",lipuscores[lipuprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["LipU-like protein",lipuscores[lipuprots.index(i)]]] + for i in lipvprots: + if i not in nucleoprots: + nucleoprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["LipV-like dehydrogenase",lipvscores[lipvprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["LipV-like dehydrogenase",lipvscores[lipvprots.index(i)]]] + for i in toybprots: + if i not in nucleoprots: + nucleoprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["ToyB-like synthase",toybscores[toybprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["ToyB-like synthase",toybscores[toybprots.index(i)]]] + for i in tundprots: + if i not in nucleoprots: + nucleoprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["TunD-like putative N-acetylglucosamine transferase",tundscores[tundprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["TunD-like putative N-acetylglucosamine transferase",tundscores[tundprots.index(i)]]] + for i in pur6prots: + if i not in nucleoprots: + nucleoprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Pur6-like synthetase",pur6scores[pur6prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Pur6-like synthetase",pur6scores[pur6prots.index(i)]]] + for i in pur10prots: + if i not in nucleoprots: + nucleoprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Pur10-like oxidoreductase",pur10scores[pur10prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Pur10-like oxidoreductase",pur10scores[pur10prots.index(i)]]] + for i in nikjprots: + if i not in nucleoprots: + nucleoprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["NikJ-like protein",nikjscores[nikjprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["NikJ-like protein",nikjscores[nikjprots.index(i)]]] + for i in nikoprots: + if i not in nucleoprots: + nucleoprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["NikO-like enolpyruvyl transferase",nikoscores[nikoprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + + detecteddomainsdict[i] = [["NikO-like enolpyruvyl transferase",nikoscores[nikoprots.index(i)]]] + #Extract phosphoglycolipid biosynthesis proteins + phosphoprots = [] + if 1 in geneclustertypes or 17 in geneclustertypes: + phosphogl = parsehmmoutput(65,hmmoutputfolder + "MoeO5.txt") + phosphoprots = phosphogl[0] + phosphoscores = phosphogl[1] + for i in phosphoprots: + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["MoeO5-like prenyl-3-phosphoglycerate synthase",phosphoscores[phosphoprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["MoeO5-like prenyl-3-phosphoglycerate synthase",phosphoscores[phosphoprots.index(i)]]] + #Extract melanin biosynthesis proteins + melaninprots = [] + if 1 in geneclustertypes or 18 in geneclustertypes: + melanin = parsehmmoutput(40,hmmoutputfolder + "melC.txt") + melaninprots = melanin[0] + melaninscores = melanin[1] + for i in melaninprots: + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["MelC-like melanin synthase",melaninscores[melaninprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["MelC-like melanin synthase",melaninscores[melaninprots.index(i)]]] + #Extract other putative secondary metabolite biosynthesis proteins + otherprots = [] + amp_t_prots = [] + if 1 in geneclustertypes or 19 in geneclustertypes: + pptb = parsehmmoutput(20,hmmoutputfolder + "PP-binding.txt") + pptbprots = pptb[0] + pptbscores = pptb[1] + cond = parsehmmoutput(20,hmmoutputfolder + "Condensation.txt") + amp = parsehmmoutput(20,hmmoutputfolder + "AMP-binding.txt") + ampprots = amp[0] + ampscores = amp[1] + ampox = parsehmmoutput(50,hmmoutputfolder + "A-OX.txt") + ampoxprots = ampox[0] + ampoxscores = ampox[1] + nad4 = parsehmmoutput(40,hmmoutputfolder + "NAD_binding_4.txt") + nad4prots = nad4[0] + nad4scores = nad4[1] + cprots = cond[0] + aprots = amp[0] + for i in ampox[0]: + if i not in aprots: + aprots.append(i) + nrpsprots2 = [] + for i in cprots: + if i in aprots: + nrpsprots2.append(i) + tprots = pptb[0] + for i in tprots: + if i in aprots and i not in nrpsprots2 and i not in aminocoumarinprots: + otherprots.append(i) + amp_t_prots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["PP-binding domain",pptbscores[pptbprots.index(i)]]) + if i in ampprots: + detdomlist.append(["Adenylation domain",ampscores[ampprots.index(i)]]) + elif i in ampoxprots: + detdomlist.append(["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in ampprots: + detecteddomainsdict[i] = [["PP-binding domain",pptbscores[pptbprots.index(i)]],["Adenylation domain",ampscores[ampprots.index(i)]]] + elif i in ampoxprots: + detecteddomainsdict[i] = [["PP-binding domain",pptbscores[pptbprots.index(i)]],["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(i)]]] + for i in nad4prots: + if i in aprots and i not in aminocoumarinprots: + otherprots.append(i) + amp_t_prots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["NAD-binding domain 4",nad4scores[nad4prots.index(i)]]) + if i in ampprots: + detdomlist.append(["Adenylation domain",ampscores[ampprots.index(i)]]) + elif i in ampoxprots: + detdomlist.append(["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + if i in ampprots: + detecteddomainsdict[i] = [["NAD-binding domain 4",nad4scores[nad4prots.index(i)]],["Adenylation domain",ampscores[ampprots.index(i)]]] + elif i in ampoxprots: + detecteddomainsdict[i] = [["NAD-binding domain 4",nad4scores[nad4prots.index(i)]],["Adenylation domain with integrated oxidase",ampoxscores[ampoxprots.index(i)]]] + lmbu = parsehmmoutput(50,hmmoutputfolder + "LmbU.txt") + lmbuprots = lmbu[0] + lmbuscores = lmbu[1] + for i in lmbuprots: + if i not in otherprots: + otherprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["LmbU-like protein",lmbuscores[lmbuprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["LmbU-like protein",lmbuscores[lmbuprots.index(i)]]] + goadsporin = parsehmmoutput(500,hmmoutputfolder + "goadsporin_like.txt") + goadsporinprots = goadsporin[0] + goadsporinscores = goadsporin[1] + for i in goadsporinprots: + if i not in otherprots: + otherprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Goadsporin-like protein",goadsporinscores[goadsporinprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Goadsporin-like protein",goadsporinscores[goadsporinprots.index(i)]]] + neocarzinostat = parsehmmoutput(28,hmmoutputfolder + "Neocarzinostat.txt") + neocarzinostatprots = neocarzinostat[0] + neocarzinostatscores = neocarzinostat[1] + for i in neocarzinostatprots: + if i not in otherprots: + otherprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Neocarzinostatin-like protein",neocarzinostatscores[neocarzinostatprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Neocarzinostatin-like protein",neocarzinostatscores[neocarzinostatprots.index(i)]]] + cyanobactin = parsehmmoutput(80,hmmoutputfolder + "cyanobactin_synth.txt") + cyanobactinprots = cyanobactin[0] + cyanobactinscores = cyanobactin[1] + for i in cyanobactinprots: + if i not in otherprots: + otherprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Cyanobactin protease",cyanobactinscores[cyanobactinprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Cyanobactin protease",cyanobactinscores[cyanobactinprots.index(i)]]] + cycdipeptide = parsehmmoutput(110,hmmoutputfolder + "cycdipepsynth.txt") + cycdipeptideprots = cycdipeptide[0] + cycdipeptidescores = cycdipeptide[1] + for i in cycdipeptideprots: + if i not in otherprots: + otherprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Cyclodipeptide synthase",cycdipeptidescores[cycdipeptideprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Cyclodipeptide synthase",cycdipeptidescores[cycdipeptideprots.index(i)]]] + fom1 = parsehmmoutput(750,hmmoutputfolder + "fom1.txt") + fom1prots = fom1[0] + fom1scores = fom1[1] + for i in fom1prots: + if i not in otherprots: + otherprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Fom1-like phosphomutase",fom1scores[fom1prots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Fom1-like phosphomutase",fom1scores[fom1prots.index(i)]]] + bcpb = parsehmmoutput(400,hmmoutputfolder + "bcpB.txt") + bcpbprots = bcpb[0] + bcpbscores = bcpb[1] + for i in bcpbprots: + if i not in otherprots: + otherprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["BcpB-like phosphomutase",bcpbscores[bcpbprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["BcpB-like phosphomutase",bcpbscores[bcpbprots.index(i)]]] + frbd = parsehmmoutput(350,hmmoutputfolder + "frbD.txt") + frbdprots = frbd[0] + frbdscores = frbd[1] + for i in frbdprots: + if i not in otherprots: + otherprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["FrbD-like phosphomutase",frbdscores[frbdprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["FrbD-like phosphomutase",frbdscores[frbdprots.index(i)]]] + mite = parsehmmoutput(400,hmmoutputfolder + "mitE.txt") + miteprots = mite[0] + mitescores = mite[1] + for i in miteprots: + if i not in otherprots: + otherprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["MitE-like CoA-ligase",mitescores[miteprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["MitE-like CoA-ligase",mitescores[miteprots.index(i)]]] + vlmb = parsehmmoutput(250,hmmoutputfolder + "vlmB.txt") + vlmbprots = vlmb[0] + vlmbscores = vlmb[1] + for i in vlmbprots: + if i not in otherprots: + otherprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Valanimycin biosynthesis VlmB domain",vlmbscores[vlmbprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Valanimycin biosynthesis VlmB domain",vlmbscores[vlmbprots.index(i)]]] + prnb = parsehmmoutput(200,hmmoutputfolder + "prnB.txt") + prnbprots = prnb[0] + prnbscores = prnb[1] + for i in prnbprots: + if i not in otherprots: + otherprots.append(i) + if detecteddomainsdict.has_key(i): + detdomlist = detecteddomainsdict[i] + detdomlist.append(["Pyrrolnitrin biosynthesis PrnB domain",prnbscores[prnbprots.index(i)]]) + detecteddomainsdict[i] = detdomlist + else: + detecteddomainsdict[i] = [["Pyrrolnitrin biosynthesis PrnB domain",prnbscores[prnbprots.index(i)]]] + if 5 not in geneclustertypes and 1 not in geneclustertypes: + nrpsprots = [] + if 4 not in geneclustertypes and 1 not in geneclustertypes: + t3pksprots = [] + if 3 not in geneclustertypes and 1 not in geneclustertypes: + t2pksprots = [] + if 2 not in geneclustertypes and 1 not in geneclustertypes: + t1pksprots = [] + t4pksprots = [] + transatpksprots = [] + #Assemble all core sec met proteins + allsecmetprots = [] + for i in t1pksprots: + if i not in allsecmetprots: + allsecmetprots.append(i) + for i in transatpksprots: + if i not in allsecmetprots: + allsecmetprots.append(i) + for i in t2pksprots: + if i not in allsecmetprots: + allsecmetprots.append(i) + for i in t3pksprots: + if i not in allsecmetprots: + allsecmetprots.append(i) + for i in t4pksprots: + if i not in allsecmetprots: + allsecmetprots.append(i) + for i in nrpsprots: + if i not in allsecmetprots: + allsecmetprots.append(i) + for i in terpeneprots: + if i not in allsecmetprots: + allsecmetprots.append(i) + for i in lantprots: + if i not in allsecmetprots: + allsecmetprots.append(i) + for i in bcinprots: + if i not in allsecmetprots: + allsecmetprots.append(i) + for i in lactamprots: + if i not in allsecmetprots: + allsecmetprots.append(i) + for i in amglyccyclprots: + if i not in allsecmetprots: + allsecmetprots.append(i) + for i in siderophoreprots: + if i not in allsecmetprots: + allsecmetprots.append(i) + for i in ectprots: + if i not in allsecmetprots: + allsecmetprots.append(i) + for i in butyrprots: + if i not in allsecmetprots: + allsecmetprots.append(i) + for i in indoleprots: + if i not in allsecmetprots: + allsecmetprots.append(i) + for i in nucleoprots: + if i not in allsecmetprots: + allsecmetprots.append(i) + for i in phosphoprots: + if i not in allsecmetprots: + allsecmetprots.append(i) + for i in melaninprots: + if i not in allsecmetprots: + allsecmetprots.append(i) + for i in aminocoumarinprots: + if i not in allsecmetprots: + allsecmetprots.append(i) + for i in otherprots: + if i not in allsecmetprots: + allsecmetprots.append(i) + allsecmetprots.sort() + + if len(allsecmetprots) == 0: + logfile.write("No secondary metabolite biosynthesis gene clusters detected in this nucleotide file.\n") + logfile.close() + print >> sys.stderr, "No secondary metabolite biosynthesis gene clusters detected in this nucleotide file." + sys.exit(1) + + elapsed = (time.time() - starttime) + #print "4713Time since start: " + str(elapsed) + + #Extract approximate gene clusters based on hmmsearch results, create list of core PKS / NRPS genes for further analysis (use less strict parameters for this then in gene cluster detection to include all PKS/NRPS domains) + #Create nucleotide fasta files with sec met gene clusters + #print "Extracting gene clusters from gbk/embl file using detected signature genes..." + logfile.write("Extracting gene clusters from gbk/embl file using detected signature genes...\n") + fastafile = open(genomename + "/clusterblast/geneclusterprots.fasta","w") + txtfile = open(genomename + "/clusterblast/geneclusters.txt","w") + wb = Workbook() + font1 = Font() + style1 = XFStyle() + style1.font = font1 + font1.bold = True + ws0 = wb.add_sheet('0') + ws0.write(0,0,"Input accession number",style1) + ws0.write(0,1,"Input name",style1) + ws0.write(0,2,"Gene cluster type",style1) + ws0.write(0,3,"Gene cluster genes",style1) + if clusterblast == "y": + ws0.write(0,4,"Compound with gene cluster of highest homology",style1) + protcodes = allsecmetprots + nuccode = genomename + gbkfile = open(infile,"r") + output = gbkfile.read() + output = output.replace("\r","\n") + #Extract description of nucleotide from gbk/embl file + if ".gbk" in infile or ".GBK" in infile or ".gb" in infile or ".GB" in infile or ".genbank" in infile or ".GENBANK" in infile: + try: + nucname1 = output.split("ACCESSION ")[0] + nucname2 = nucname1.split("DEFINITION ")[1] + nucname3 = nucname2.replace("\n","") + while " " in nucname3: + nucname3 = nucname3.replace(" "," ") + nucname = nucname3 + except(KeyError,IOError,IndexError): + nucname = "input_nucleotide" + elif ".embl" in infile or ".EMBL" in infile or ".emb" in infile or ".EMB" in infile: + try: + nucname1 = output.split("DE ")[1] + nucname2 = nucname1.split("\n")[0] + nucname3 = nucname2.replace("\n","") + while " " in nucname3: + nucname3 = nucname3.replace(" "," ") + nucname = nucname3 + except(KeyError,IOError,IndexError): + nucname = "input_nucleotide" + protstartlocations = [] + protendlocations = [] + genelist = proteins[2] + genedict = proteins[3] + #Save all locations of query proteins on the nucleotide in a list + for j in protcodes: + if j in genelist: + protstart_abs = min([int(genedict[j][0]),int(genedict[j][1])]) + protend_abs = max([int(genedict[j][0]),int(genedict[j][1])]) + protstartlocations.append(protstart_abs) + protendlocations.append(protend_abs) + #Identify clusters of genes based on protein locations on the nucleotide + clusterstarts = [] + clusterends = [] + protstartlocations.sort() + protendlocations.sort() + nrlocations = len(protstartlocations) + a = 0 + for i in protstartlocations: + if a == 0: + start = str(i) + clusterstarts.append(start) + if len(protendlocations) == 1: + clusterends.append(protendlocations[a]) + elif a == nrlocations - 1: + if i < ((protendlocations[a - 1]) + 20000): + clusterends.append(str(protendlocations[a])) + else: + end = str(protendlocations[a - 1]) + clusterends.append(end) + clusterstarts.append(str(i)) + clusterends.append(str(protendlocations[a])) + else: + if i > ((protendlocations[a - 1]) + 20000): + clusterends.append(str(protendlocations[a - 1])) + start = str(i) + clusterstarts.append(start) + else: + pass + a += 1 + lastendlocation = i + #Extend clusters with 20kb on each side of the identified core genes + clusterstarts2 = [] + for i in clusterstarts: + j = int(i) - 20000 + if j < 0: + j = 0 + clusterstarts2.append(j) + clusterstarts = clusterstarts2 + clusterends2 = [] + for i in clusterends: + j = int(i) + 20000 + clusterends2.append(j) + clusterends = clusterends2 + #For each genbank secondary metabolite gene cluster: extract all proteins and write to fasta, + a = 0 + clusterinfo = {} + geneclusters = [] + geneclustergenes = [] + allcoregenes = [] + for i in clusterstarts: + cstart = int(i) + cend = int(clusterends[a]) + a += 1 + clusternr = a + geneclusters.append(clusternr) + coregenes = [] + clustergenes = [] + #For each gene in nucleotide, check if it is inside this cluster; if, so append info to list of clustergenes + if a == 1: + for i in genelist: + geneinfo = genedict[i][:-1] + geneinfo.append(i) + genedict[i] = geneinfo + for i in genelist: + geneinfo = genedict[i] + genestart = int(geneinfo[0]) + geneend = int(geneinfo[1]) + if (genestart > cstart and genestart < cend) or (geneend > cstart and geneend < cend): + clustergenes.append(geneinfo) + #Determine type of cluster + type = "other" + z = 0 + for k in clustergenes: + i = k[4] + if i in t1pksprots: + if z == 0: + type = "t1pks" + elif "t1pks" not in type: + type = type + "-t1pks" + z = 1 + if i in transatpksprots: + if z == 0: + type = "transatpks" + elif "transatpks" not in type: + type = type + "-transatpks" + z = 1 + if i in t2pksprots: + if z == 0: + type = "t2pks" + elif "t2pks" not in type: + type = type + "-t2pks" + z = 1 + if i in t3pksprots: + if z == 0: + type = "t3pks" + elif "t3pks" not in type: + type = type + "-t3pks" + z = 1 + if i in t4pksprots: + if z == 0: + type = "t1pks" + elif "t1pks" not in type: + type = type + "-t1pks" + z = 1 + if i in nrpsprots: + if z == 0: + type = "nrps" + elif "nrps" not in type: + type = type + "-nrps" + z = 1 + if i in terpeneprots: + if z == 0: + type= "terpene" + elif "terpene" not in type: + type = type + "-terpene" + z = 1 + if i in lantprots: + if z == 0: + type= "lant" + elif "lant" not in type: + type = type + "-lant" + z = 1 + if i in bcinprots: + if z == 0: + type= "bcin" + elif "bcin" not in type: + type = type + "-bcin" + z = 1 + if i in lactamprots: + if z == 0: + type = "blactam" + elif "blactam" not in type: + type = type + "-blactam" + z = 1 + if i in amglyccyclprots: + if z == 0: + type = "amglyccycl" + elif "amglyccycl" not in type: + type = type + "-amglyccycl" + z = 1 + if i in siderophoreprots: + if z == 0: + type = "siderophore" + elif "siderophore" not in type: + type = type + "-siderophore" + z = 1 + if i in ectprots: + if z == 0: + type = "ectoine" + elif "ectoine" not in type: + type = type + "-ectoine" + z = 1 + if i in indoleprots: + if z == 0: + type = "indole" + elif "indole" not in type: + type = type + "-indole" + z = 1 + if i in nucleoprots: + if z == 0: + type = "nucleoside" + elif "nucleoside" not in type: + type = type + "-nucleoside" + z = 1 + if i in phosphoprots: + if z == 0: + type = "phosphoglycolipid" + elif "phosphoglycolipid" not in type: + type = type + "-phosphoglycolipid" + z = 1 + if i in butyrprots: + if z == 0: + type = "butyrolactone" + elif "butyrolactone" not in type: + type = type + "-butyrolactone" + z = 1 + if i in melaninprots: + if z == 0: + type = "melanin" + elif "melanin" not in type: + type = type + "-melanin" + z = 1 + if i in aminocoumarinprots: + if z == 0: + type = "aminocoumarin" + elif "aminocoumarin" not in type: + type = type + "-aminocoumarin" + z = 1 + if "other-" in type[:6]: + type = type[6:] + #Shorten gene cluster if type is among typically short gene cluster types + if cend > dnaseqlength: + cend = dnaseqlength + if type == "t3pks" or type == "t2pks": + if cstart != 0: + cstart = cstart + 5000 + if cend != dnaseqlength: + cend = cend - 5000 + clustergenes2 = [] + for i in clustergenes: + start = int(i[0]) + end = int(i[1]) + if (start > cstart and start < cend) or (end > cstart and end < cend): + clustergenes2.append(i) + clustergenes = clustergenes2 + if type == "bcin" or type == "siderophore" or type == "lant" or type == "terpene": + if cstart != 0: + cstart = cstart + 10000 + if cend != dnaseqlength: + cend = cend - 10000 + clustergenes2 = [] + for i in clustergenes: + start = int(i[0]) + end = int(i[1]) + if (start > cstart and start < cend) or (end > cstart and end < cend): + clustergenes2.append(i) + clustergenes = clustergenes2 + if type == "butyrolactone" or type == "melanin" or type == "ectoine": + if cstart != 0: + cstart = cstart + 17000 + if cend != dnaseqlength: + cend = cend - 17000 + clustergenes2 = [] + for i in clustergenes: + start = int(i[0]) + end = int(i[1]) + if (start > cstart and start < cend) or (end > cstart and end < cend): + clustergenes2.append(i) + clustergenes = clustergenes2 + #For all clustergenes, write info to fasta + for i in clustergenes: + start = str(i[0]) + end = str(i[1]) + strand = i[2] + seq = seqdict[i[4]] + ann = i[3].replace(" ","_") + accession = i[4] + name = nuccode + "|c" + str(a) + "|" + start + "-" + end + "|" + strand + "|" + accession + "|" + ann + fastafile.write(">" + name + "\n" + seq + "\n") + if accession not in geneclustergenes: + geneclustergenes.append(accession) + #Write gene cluster info to separate txt file + txtfile.write(nuccode + "\t" + nucname + "\t" + "c" + str(a) + "\t" + type + "\t") + ws0.write(a,0,genomic_accnr) + try: + ws0.write(a,1,nucname) + except: + ws0.write(a,1,"Name to long to be contained in Excel cell; see txt file in downloadable zip archive.") + ws0.write(a,2,type) + xlsgenesfield = "" + for i in clustergenes: + txtfile.write(i[4] + ";") + xlsgenesfield = xlsgenesfield + i[4] + ";" + txtfile.write("\t") + for i in clustergenes: + txtfile.write(accessiondict[i[4]] + ";") + xlsgenesfield = xlsgenesfield[:-1] + try: + ws0.write(a,3,xlsgenesfield) + except: + ws0.write(a,3,"Too many genes to be contained in Excel cell; see txt file in downloadable zip archive.") + txtfile.write("\n") + #Write gene cluster info to clusterinfo dictionary + for i in clustergenes: + if i[4] in allsecmetprots: + coregenes.append(i[4]) + allcoregenes.append(i[4]) + clusterinfo[clusternr] = [type,cstart,cend,coregenes,clustergenes] + #Close xls, fasta and txt files + fastafile.close() + txtfile.close() + + #Analysis of core PKS/NRPS genes (separate py), detect subgroups and predict specificities and final products + #Make list of PKS / NRPS gene clusters to be analysed + #print "Analysing core PKS/NRPS genes..." + logfile.write("Analysing core PKS/NRPS genes...\n") + pksnrpsgeneclusters = [] + pksnrpscoregenes = [] + for i in geneclusters: + if "t1pks" in clusterinfo[i][0] or "t4pks" in clusterinfo[i][0] or "transatpks" in clusterinfo[i][0] or "nrps" in clusterinfo[i][0]: + pksnrpsgeneclusters.append(i) + for i in t1pksprots: + pksnrpscoregenes.append(i) + for i in transatpksprots: + pksnrpscoregenes.append(i) + for i in t4pksprots: + pksnrpscoregenes.append(i) + for i in nrpsprots: + pksnrpscoregenes.append(i) + for i in amp_t_prots: + pksnrpscoregenes.append(i) + pksnrpsgenestartdict = {} + for i in pksnrpscoregenes: + start = int(genedict[i][0]) + pksnrpsgenestartdict[i] = start + pksnrpscoregenes = sortdictkeysbyvalues(pksnrpsgenestartdict) + nrpsnames = [] + nrpsseqs = [] + pksnrpsnames = [] + pksnrpsseqs = [] + pksnames = [] + pksseqs = [] + calnames = [] + calseqs = [] + krnames = [] + krseqs = [] + nrpspkstypedict = {} + domaindict = {} + if len(pksnrpscoregenes) > 0: + #Write PKS / NRPS core genes to FASTA file + for i in pksnrpscoregenes: + name = i + seq = seqdict[i] + pksnrpsnames.append(name) + pksnrpsseqs.append(seq) + writefasta(pksnrpsnames,pksnrpsseqs,genomename + "/nrpspks_proteins.fasta") + #Analyse for abMotifs + hmmsearch = hmmscan_path + " --cpu " + str(nrcpus) + " -E 0.1 -o " + genomename + "/nrpspks/abmotifshmm_output.txt" + " --noali --tblout " + genomename + "/nrpspks/abmotifshmm.txt "+ hmms_path +"abmotifs.hmm " + genomename + "/nrpspks_proteins.fasta" + os.system(hmmsearch) + mhmmlengthsdict = hmmlengths(hmms_path+"abmotifs.hmm") + motifdict = hmmscanparse(genomename + "/nrpspks/abmotifshmm_output.txt",mhmmlengthsdict) + #Analyse for C/A/PCP/E/KS/AT/ATd/DH/KR/ER/ACP/TE/TD/COM/Docking/MT/CAL domains + hmmsearch = hmmscan_path + " --cut_tc --cpu " + str(nrcpus) + " -o " + genomename + "/nrpspks/nrpspkshmm_output.txt" + " --noali --tblout " + genomename + "/nrpspks/nrpspkshmm.txt "+ hmms_path +"nrpspksdomains.hmm " + genomename + "/nrpspks_proteins.fasta" + os.system(hmmsearch) + hmmlengthsdict = hmmlengths(hmms_path+"nrpspksdomains.hmm") + domaindict = hmmscanparse(genomename + "/nrpspks/nrpspkshmm_output.txt",hmmlengthsdict) + nrpspksdomainsfile = open(genomename + "/nrpspks/nrpspksdomains.txt","w") + #Analyse KS domains & PKS/NRPS protein domain composition to detect NRPS/PKS types + kshmmsearch = hmmscan_path + " --cut_tc --cpu " + str(nrcpus) + " -o " + genomename + "/nrpspks/kshmm_output.txt" + " --noali --tblout " + genomename + "/nrpspks/kshmm.txt " + hmms_path + "ksdomains.hmm " + genomename + "/nrpspks_proteins.fasta" + os.system(kshmmsearch) + kshmmlengthsdict = hmmlengths(hmms_path+"ksdomains.hmm") + ksdomaindict = hmmscanparse(genomename + "/nrpspks/kshmm_output.txt",kshmmlengthsdict) + for k in pksnrpscoregenes: + #structure of domaindict: domaindict[genename] = [[name,start,end,evalue,score],[name,start,end,evalue,score], etc.] + domainlist = [] + nrKSdomains = 0 + for i in domaindict[k]: + domainlist.append(i[0]) + if i[0] == "PKS_KS": + nrKSdomains += 1 + modKSscore = 0 + traKSscore = 0 + eneKSscore = 0 + iterKSscore = 0 + for i in ksdomaindict[k]: + if i[0] == "Trans-AT-KS": + traKSscore += 1 + if i[0] == "Modular-KS": + modKSscore += 1 + if i[0] == "Enediyne-KS": + eneKSscore += 1 + if i[0] == "Iterative-KS": + iterKSscore += 1 + for i in domaindict[k]: + if "Cglyc" in domainlist and "Epimerization" in domainlist and "AMP-binding" in domainlist and "PKS_KS" not in domainlist and "PKS_AT" not in domainlist: + type = "Glycopeptide NRPS" + elif ("Condensation_LCL" in domainlist or "Condensation_DCL" in domainlist or "Condensation_Starter" in domainlist or "Cglyc" in domainlist or "Condensation_Dual" in domainlist) and "AMP-binding" in domainlist and "PKS_KS" not in domainlist and "PKS_AT" not in domainlist: + type = "NRPS" + elif ("Condensation_LCL" in domainlist or "Condensation_DCL" in domainlist or "Condensation_Starter" in domainlist or "Cglyc" in domainlist or "Condensation_Dual" in domainlist) or "AMP-binding" in domainlist and ("PKS_KS" in domainlist or "PKS_AT" in domainlist): + type = "Hybrid PKS-NRPS" + elif ("Condensation_LCL" not in domainlist and "Condensation_DCL" not in domainlist and "Condensation_Starter" not in domainlist and "Cglyc" not in domainlist and "Condensation_Dual" not in domainlist and "AMP-binding" not in domainlist) and "PKS_KS" in domainlist and "PKS_AT" not in domainlist and "Trans-AT_docking" in domainlist and traKSscore > modKSscore and traKSscore > iterKSscore and traKSscore > eneKSscore: + type = "Type I Trans-AT PKS" + elif ("Condensation_LCL" not in domainlist and "Condensation_DCL" not in domainlist and "Condensation_Starter" not in domainlist and "Cglyc" not in domainlist and "Condensation_Dual" not in domainlist and "AMP-binding" not in domainlist) and "PKS_KS" in domainlist and "PKS_AT" in domainlist and iterKSscore > modKSscore and iterKSscore > traKSscore and iterKSscore > eneKSscore and nrKSdomains < 3: + type = "Type I Iterative PKS" + elif ("Condensation_LCL" not in domainlist and "Condensation_DCL" not in domainlist and "Condensation_Starter" not in domainlist and "Cglyc" not in domainlist and "Condensation_Dual" not in domainlist and "AMP-binding" not in domainlist) and "PKS_KS" in domainlist and "PKS_AT" in domainlist and eneKSscore > modKSscore and eneKSscore > traKSscore and eneKSscore > iterKSscore and nrKSdomains < 3: + type = "Type I Enediyne PKS" + elif ("Condensation_LCL" not in domainlist and "Condensation_DCL" not in domainlist and "Condensation_Starter" not in domainlist and "Cglyc" not in domainlist and "Condensation_Dual" not in domainlist and "AMP-binding" not in domainlist) and "PKS_KS" in domainlist and "PKS_AT" in domainlist and ((modKSscore > eneKSscore and modKSscore > traKSscore and modKSscore > iterKSscore) or nrKSdomains > 3): + type = "Type I Modular PKS" + elif ("Condensation_LCL" not in domainlist and "Condensation_DCL" not in domainlist and "Condensation_Starter" not in domainlist and "Cglyc" not in domainlist and "Condensation_Dual" not in domainlist and "AMP-binding" not in domainlist) and "PKS_KS" in domainlist and "PKS_AT" in domainlist: + type = "PKS-like protein" + elif ("Condensation_LCL" in domainlist or "Condensation_DCL" in domainlist or "Condensation_Starter" in domainlist or "Cglyc" in domainlist or "Condensation_Dual" in domainlist or "AMP-binding" in domainlist) and "PKS_KS" not in domainlist and "PKS_AT" not in domainlist: + type = "NRPS-like protein" + else: + type = "PKS/NRPS-like protein" + nrpspkstypedict[k] = type + #Write data to output file + for k in pksnrpscoregenes: + j = domaindict[k] + l = motifdict[k] + nrpspksdomainsfile.write(">> " + k + "\n") + nrpspksdomainsfile.write(">> " + nrpspkstypedict[k] + "\n") + nrpspksdomainsfile.write("name\tstart\tend\te-value\tscore\n") + for i in j: + #nrpspksdomainsfile.write(str(i[0]) + "\t" + str(i[1]) + "\t" + str(i[2]) + "\t" + str(i[3]) + "\t" + str(i[4]) + "\n") + nrpspksdomainsfile.write("%s\t%s\t%s\t%s\t%s\n" % (i[0], i[1], i[2], i[3], i[4]) ) + nrpspksdomainsfile.write("** Motifs: **\n") + for i in l: + #nrpspksdomainsfile.write(str(i[0]) + "\t" + str(i[1]) + "\t" + str(i[2]) + "\t" + str(i[3]) + "\t" + str(i[4]) + "\n") + nrpspksdomainsfile.write("%s\t%s\t%s\t%s\t%s\n" % (i[0], i[1], i[2], i[3], i[4]) ) + nrpspksdomainsfile.write("\n\n") + nrpspksdomainsfile.close() + + elapsed = (time.time() - starttime) + #print "5163Time since start: " + str(elapsed) + + #Predict NRPS A domain specificities with NRPSPredictor and Minowa et al. method + #print "Predicting NRPS A domain substrate specificities by NRPSPredictor" + logfile.write("Predicting NRPS A domain substrate specificities by NRPSPredictor\n") + #NRPSPredictor: extract AMP-binding + 120 residues N-terminal of this domain, extract 8 Angstrom residues and insert this into NRPSPredictor + for k in pksnrpscoregenes: + j = domaindict[k] + nr = 0 + for i in j: + if i[0] == "AMP-binding" or i[0] == "A-OX": + nr += 1 + start = int(i[1]) + end = int(i[2]) + 120 + seq = seqdict[k][start:end] + name = k + "_A" + str(nr) + nrpsnames.append(name) + nrpsseqs.append(seq) + if len(nrpsnames) > 0: + writefasta(nrpsnames,nrpsseqs,"NRPSPredictor2/nrpsseqs.fasta") + #nrpspredcommand = "perl nrpsSpecPredictor.pl nrpsseqs.fasta ../" + nrpspredictoroutputfolder + " ." #OLD NRPSPREDICTOR1 command + os.chdir("NRPSPredictor2/") + #Get NRPSPredictor2 code predictions, output sig file for input for NRPSPredictor2 SVMs + if sys.platform == ('win32'): + nrpspred2codecommand = 'nrpscodepred nrpsseqs.fasta input.sig nrpscodes.txt > nul' + if sys.platform == ('linux2'): + nrpspred2codecommand = 'python nrpscodepred.py nrpsseqs.fasta input.sig nrpscodes.txt > /dev/null' + os.system(nrpspred2codecommand) + #Run NRPSPredictor2 SVM + currentdir = os.getcwd() + if sys.platform == ('win32'): + nrpspred2command = 'java -Ddatadir="' + currentdir + '\\data" -cp build/NRPSpredictor2.jar;lib/java-getopt-1.0.13.jar;lib/Utilities.jar;lib/libsvm.jar org.roettig.NRPSpredictor2.NRPSpredictor2 -i input.sig -r ..\\' + nrpspredictoroutputfolder + 'nrpspredictor2.out -s 1' + if sys.platform == ('linux2'): + nrpspred2command = './NRPSpredictor2.sh -i input.sig -r ../' + nrpspredictoroutputfolder + 'nrpspredictor2.out -s 1' + os.popen(nrpspred2command) + #Copy NRPSPredictor results + if sys.platform == ('win32'): + copycommand = 'copy/y nrpscodes.txt ..\\' + nrpspredictoroutputfolder.replace("/","\\") + ' > nul' + if sys.platform == ('linux2'): + copycommand = 'cp nrpscodes.txt ../' + nrpspredictoroutputfolder + " > /dev/null" + os.system(copycommand) + os.chdir("..") + elapsed = (time.time() - starttime) + #print "5206Time since start: " + str(elapsed) + # folgendes bis zum naechsten time braucht 500s, liegt wohl haupsaechlich an schlechtem minowa_A code + #Minowa method: extract AMP-binding domain, and run Minowa_A + if len(nrpsnames) > 0: + #print "Predicting NRPS A domain substrate specificities by Minowa et al. method\n" + logfile.write("Predicting NRPS A domain substrate specificities by Minowa et al. method") + nrpsnames2 = [] + nrpsseqs2 = [] + for k in pksnrpscoregenes: + j = domaindict[k] + nr = 0 + for i in j: + if i[0] in ["AMP-binding", "A-OX"]: + nr += 1 + start = int(i[1]) + end = int(i[2]) + seq = seqdict[k][start:end] + name = k + "_A" + str(nr) + nrpsnames2.append(name) + nrpsseqs2.append(seq) + writefasta(nrpsnames2,nrpsseqs2,minowanrpsoutputfolder + "nrpsseqs.fasta") + if sys.platform == ('win32'): + minowanrpscommand = "minowa_A ../" + minowanrpsoutputfolder + "nrpsseqs.fasta ../" + minowanrpsoutputfolder + "nrpspredoutput.txt" + if sys.platform == ('linux2'): + minowanrpscommand = "python minowa_A.py ../" + minowanrpsoutputfolder + "nrpsseqs.fasta ../" + minowanrpsoutputfolder + "nrpspredoutput.txt" + os.chdir("Minowa/") + os.system(minowanrpscommand) + os.chdir("..") + + elapsed = (time.time() - starttime) + #print "5235Time since start: " + str(elapsed) + #Predict PKS AT domain specificities with Minowa et al. method and PKS code (NP searcher / ClustScan / own?) + for k in pksnrpscoregenes: + j = domaindict[k] + nr = 0 + for i in j: + if i[0] == "PKS_AT": + nr += 1 + start = int(i[1]) + end = int(i[2]) + seq = seqdict[k][start:end] + name = k + "_AT" + str(nr) + pksnames.append(name) + pksseqs.append(seq) + if len(pksnames) > 0: + writefasta(pksnames,pksseqs,pkssignatureoutputfolder + "pksseqs.fasta") + writefasta(pksnames,pksseqs,minowapksoutputfolder + "pksseqs.fasta") + #Run PKS signature analysis + elapsed = (time.time() - starttime) + #print "5254Time since start: " + str(elapsed) + print "Predicting PKS AT domain substrate specificities by Yadav et al. PKS signature sequences" + logfile.write("Predicting PKS AT domain substrate specificities by Yadav et al. PKS signature sequences\n") + if sys.platform == ('win32'): + pkspredcommand = "PKS_analysis ../" + pkssignatureoutputfolder + "pksseqs.fasta ../" + pkssignatureoutputfolder + "pkspredoutput.txt" + if sys.platform == ('linux2'): + pkspredcommand = "python PKS_analysis.py ../" + pkssignatureoutputfolder + "pksseqs.fasta ../" + pkssignatureoutputfolder + "pkspredoutput.txt" + os.chdir("pkssignatures/") + os.system(pkspredcommand) + os.chdir("..") + #Minowa method: run Minowa_AT + elapsed = (time.time() - starttime) + #print "5266Time since start: " + str(elapsed) + print "Predicting PKS AT domain substrate specificities by Minowa et al. method" + logfile.write("Predicting PKS AT domain substrate specificities by Minowa et al. method\n") + if sys.platform == ('win32'): + minowapkscommand = "minowa_AT ../" + minowapksoutputfolder + "pksseqs.fasta ../" + minowapksoutputfolder + "pkspredoutput.txt" + if sys.platform == ('linux2'): + minowapkscommand = "python minowa_AT.py ../" + minowapksoutputfolder + "pksseqs.fasta ../" + minowapksoutputfolder + "pkspredoutput.txt" + os.chdir("Minowa/") + os.system(minowapkscommand) + os.chdir("..") + + #Predict PKS CAL domain specificities with Minowa et al. method + elapsed = (time.time() - starttime) + #print "5279Time since start: " + str(elapsed) + print "Predicting CAL domain substrate specificities by Minowa et al. method" + logfile.write("Predicting CAL domain substrate specificities by Minowa et al. method\n") + for k in pksnrpscoregenes: + j = domaindict[k] + nr = 0 + for i in j: + if i[0] == "CAL_domain": + nr += 1 + start = int(i[1]) + end = int(i[2]) + seq = seqdict[k][start:end] + name = k + "_CAL" + str(nr) + calnames.append(name) + calseqs.append(seq) + if len(calnames) > 0: + writefasta(calnames,calseqs,minowacaloutputfolder + "calseqs.fasta") + if sys.platform == ('win32'): + minowacalcommand = "minowa_CAL ../" + minowacaloutputfolder + "calseqs.fasta ../" + minowacaloutputfolder + "calpredoutput.txt" + if sys.platform == ('linux2'): + minowacalcommand = "python minowa_CAL.py ../" + minowacaloutputfolder + "calseqs.fasta ../" + minowacaloutputfolder + "calpredoutput.txt" + os.chdir("Minowa/") + os.system(minowacalcommand) + os.chdir("..") + + elapsed = (time.time() - starttime) + #print "5305Time since start: " + str(elapsed) + #Predict PKS KR domain stereochemistry using pattern as published in ClustScan + print "Predicting PKS KR activity and stereochemistry using KR fingerprints from Starcevic et al." + logfile.write("Predicting PKS KR activity and stereochemistry using KR fingerprints from Starcevic et al.\n") + for k in pksnrpscoregenes: + j = domaindict[k] + nr = 0 + for i in j: + if i[0] == "PKS_KR": + nr += 1 + start = int(i[1]) + end = int(i[2]) + seq = seqdict[k][start:end] + name = k + "_KR" + str(nr) + krnames.append(name) + krseqs.append(seq) + if len(krnames) > 0: + writefasta(krnames,krseqs,kranalysisoutputfolder + "krseqs.fasta") + if sys.platform == ('win32'): + kranalysiscommand = "kr_analysis ../" + kranalysisoutputfolder + "krseqs.fasta ../" + kranalysisoutputfolder + "krpredoutput.txt" + if sys.platform == ('linux2'): + kranalysiscommand = "python kr_analysis.py ../" + kranalysisoutputfolder + "krseqs.fasta ../" + kranalysisoutputfolder + "krpredoutput.txt" + os.chdir("kr_analysis/") + os.system(kranalysiscommand) + os.chdir("..") + + #Read and parse all substrate specificity prediction output files + minowa_nrps_preds = {} + minowa_nrps_preds_details = {} + nrps_svm_preds = {} + nrps_svm_preds_details = {} + nrps_code_preds = {} + nrps_code_preds_details = {} + substratetransdict2 = {'pipecolate':'pip','fOHOrn':'orn','beta-Lys':'blys','5NhOrn':'orn','OHOrn':'orn','Aad':'Aaa','bOHTyr':'bht'} + if len(nrpsnames) > 0: + minowa_a_file = open(minowanrpsoutputfolder + "nrpspredoutput.txt","r") + minowa_a_file = minowa_a_file.read() + minowa_a_file = minowa_a_file.replace("\r","\n") + parts = minowa_a_file.split("\\\\\n")[1:] + for i in parts: + partlines = i.split("\n") + acc = partlines[0] + tophit = partlines[2].split("\t")[0] + if tophit in substratetransdict2.keys(): + tophit = substratetransdict2[tophit] + minowa_nrps_preds[acc] = tophit.lower() + minowa_nrps_preds_details[acc] = "Minowa HMM method A-domain
Substrate specificity prediction top hits:
\n" + partlines[1] + "
\n" + partlines[2] + "
\n" + partlines[3] + "
\n" + partlines[4] + "

\n\n" + nrpspredictorfile1 = open(nrpspredictoroutputfolder + "nrpspredictor2.out","r") + nrpspredictorfile2 = open(nrpspredictoroutputfolder + "nrpscodes.txt","r") + nrpspredictorfile1 = nrpspredictorfile1.read() + nrpspredictorfile1 = nrpspredictorfile1.replace("\r","\n") + lines = nrpspredictorfile1.split("\n")[1:-1] + for k in lines: + tabs = k.split("\t") + nrps_svm_preds[tabs[0]] = tabs[6] + nrps_svm_preds_details[tabs[0]] = " NRPSPredictor2 SVM prediction details:
\n8 Angstrom 34 AA code:
\n" + tabs[1] + "
\nPredicted physicochemical class:
\n" + tabs[3] + "
\nLarge clusters prediction:
\n" + tabs[4] + "
\nSmall clusters prediction:
\n" + tabs[5] + "
\nSingle AA prediction:
\n" + tabs[6] + "

\n\n" + nrpspredictorfile2 = nrpspredictorfile2.read() + nrpspredictorfile2 = nrpspredictorfile2.replace("\r","\n") + lines = nrpspredictorfile2.split("\n")[:-1] + for k in lines: + tabs = k.split("\t") + nrps_code_preds[tabs[0]] = tabs[1] + nrps_code_preds_details[tabs[0]] = " NRPSPredictor2 Stachelhaus code prediction:
\n" + tabs[1] + "

\n\n" + minowa_pks_preds_details = {} + minowa_pks_preds = {} + pks_code_preds ={} + pks_code_preds_details ={} + substratetransdict = {'Malonyl-CoA':'mal','Methylmalonyl-CoA':'mmal','Methoxymalonyl-CoA':'mxmal','Ethylmalonyl-CoA':'emal','Isobutyryl-CoA':'isobut','2-Methylbutyryl-CoA':'2metbut','trans-1,2-CPDA':'trans-1,2-CPDA','Acetyl-CoA':'Acetyl-CoA','Benzoyl-_CoA':'benz','Propionyl-CoA':'prop','3-Methylbutyryl-CoA':'3metbut','Ethylmalonyl-CoA':'Ethyl_mal','CE-Malonyl-CoA':'cemal','2-Rhyd-Malonyl-CoA':'2Rhydmal','CHC-CoA':'CHC-CoA','inactive':'inactive'} + if len(pksnames) > 0: + minowa_at_file = open(minowapksoutputfolder + "pkspredoutput.txt","r") + minowa_at_file = minowa_at_file.read() + minowa_at_file = minowa_at_file.replace("\r","\n") + parts = minowa_at_file.split("\\\\\n")[1:] + for i in parts: + partlines = i.split("\n") + acc = partlines[0] + if substratetransdict.has_key(partlines[2].split("\t")[0]): + tophit = substratetransdict[partlines[2].split("\t")[0]] + else: + tophit = "pk" + minowa_pks_preds[acc] = tophit + minowa_pks_preds_details[acc] = "Minowa HMM method AT-domain
Substrate specificity prediction top hits:
\n" + partlines[1] + "
\n" + partlines[2] + "
\n" + partlines[3] + "
\n" + partlines[4] + "

\n\n" + pkssignaturefile = open(pkssignatureoutputfolder + "pkspredoutput.txt","r") + pkssignaturefile = pkssignaturefile.read() + pkssignaturefile = pkssignaturefile.replace("\r","\n") + parts = pkssignaturefile.split("//\n")[1:] + for i in parts: + partlines = i.split("\n") + partlines2 = [] + for j in partlines: + if j != "": + partlines2.append(j) + partlines = partlines2 + acc = partlines[0].split("\t")[0] + if len(partlines) > 2: + tophit = (partlines[1].split("\t")[0]).split("__")[1] + pks_code_preds[acc] = tophit + codes = [] + prots = [] + scores = [] + for i in partlines[1:4]: + codes.append(i.split("\t")[0]) + prot = i.split("\t")[1] + prot = prot.replace("_AT"," (AT") + prot = prot.replace("__","): ") + prots.append(prot) + scores.append(i.split("\t")[2]) + if len(prots) >= 3: + pks_code_preds_details[acc] = "PKS Active Site Signature method
AT-domain substrate specificity prediction top hits:
\nCode:" + partlines[0].split("\t")[1] + "
\n" + codes[0] + " - " + prots[0] + " : (" + scores[0] + "% identity)
\n" + codes[1] + " - " + prots[1] + " : (" + scores[1] + "% identity)
\n" + codes[2] + " - " + prots[2] + " : (" + scores[2] + "% identity)

\n\n" + elif len(prots) == 2: + pks_code_preds_details[acc] = "PKS Active Site Signature method
AT-domain substrate specificity prediction top hits:
\nCode:" + partlines[0].split("\t")[1] + "
\n" + codes[0] + " - " + prots[0] + " : (" + scores[0] + "% identity)
\n" + codes[1] + " - " + prots[1] + " : (" + scores[1] + "% identity)

\n\n" + elif len(prots) == 1: + pks_code_preds_details[acc] = "PKS Active Site Signature method
AT-domain substrate specificity prediction top hits:
\nCode:" + partlines[0].split("\t")[1] + "
\n" + codes[0] + " - " + prots[0] + " : (" + scores[0] + "% identity)

\n\n" + else: + pks_code_preds[acc] = "N/A" + pks_code_preds_details[acc] = "PKS Active Site Signature method
No AT-domain substrate specificity prediction hits above 40% identity.
\n\n" + minowa_cal_preds = {} + minowa_cal_preds_details = {} + if len(calnames) > 0: + minowa_cal_file = open(minowacaloutputfolder + "calpredoutput.txt","r") + minowa_cal_file = minowa_cal_file.read() + minowa_cal_file = minowa_cal_file.replace("\r","\n") + parts = minowa_cal_file.split("\\\\\n")[1:] + for i in parts: + partlines = i.split("\n") + acc = partlines[0] + tophit = partlines[2].split("\t")[0] + minowa_cal_preds[acc] = tophit + minowa_cal_preds_details[acc] = "Minowa HMM method
CAL-domain substrate specificity prediction top hits:
\n" + partlines[1] + "
\n" + partlines[2] + "
\n" + partlines[3] + "
\n" + partlines[4] + "

\n\n" + kr_activity_preds = {} + kr_stereo_preds = {} + if len(krnames) > 0: + krfile = open(kranalysisoutputfolder + "krpredoutput.txt","r") + krfile = krfile.read() + krfile = krfile.replace("\r","\n") + krlines = krfile.split("\n")[:-1] + for i in krlines: + tabs = i.split("\t") + kr_activity_preds[tabs[0]] = tabs[1] + kr_stereo_preds[tabs[0]] = tabs[2] + + #Combine substrate specificity predictions into consensus prediction + consensuspreds = {} + #available_smiles_parts = ['ALA','ARG','ASN','ASP','CYS','GLN','GLU','GLY','HIS','ILE','LEU','LYS','PHE','PRO','SER','THR','TRP','TYR','VAL','MET','ORN','ala','arg','asn','asp','cys','gln','glu','gly','his','ile','leu','lys','phe','pro','ser','thr','trp','tyr','val','met','orn','Ala','Arg','Asn','Asp','Cys','Gln','Glu','Gly','His','Ile','Leu','Lys','Phe','Pro','Ser','Thr','Trp','Tyr','Val','Met','Orn','MPRO','23DHB','34DHB','2HIVA','PGLY','DAB','BALA','AEO','4MHA','PICO','AAA','DHA','SCY','PIP','BMT','ADDS','mpro','23dhb','34dhb','2hiva','pgly','dab','bala','aeo','4mha','pico','aaa','dha','scy','pip','bmt','adds','Mpro','23Dhb','34Dhb','2Hiva','Pgly','Dab','Bala','Aeo','4Mha','Pico','Aaa','Dha','Scy','Pip','Bmt','Adds','mal','mmal','omal','emal','nrp','pk'] + available_smiles_parts = ['GLY','ALA','VAL','LEU','ILE','MET','PRO','PHE','TRP','SER','THR','ASN','GLN','TYR','CYS','LYS','ARG','HIS','ASP','GLU','MPRO','ORN','PGLY','DAB','BALA','AEO','DHA','PIP','BMT','gly','ala','val','leu','ile','met','pro','phe','trp','ser','thr','asn','gln','tyr','cys','lys','arg','his','asp','glu','aaa','mpro','dhb','2hiva','orn','pgly','dab','bala','aeo','4mha','pico','phg','dha','scy','pip','bmt','adds','aad','abu','hiv','dhpg','bht','3-me-glu','4pPro','ala-b','ala-d','dht','Sal','tcl','lys-b','hpg','hyv-d','iva','vol','mal','mmal','mxmal','emal','nrp','pk','Gly','Ala','Val','Leu','Ile','Met','Pro','Phe','Trp','Ser','Thr','Asn','Gln','Tyr','Cys','Lys','Arg','His','Asp','Glu','Mpro','23Dhb','34Dhb','2Hiva','Orn','Pgly','Dab','Bala','Aeo','4Mha','Pico','Aaa','Dha','Scy','Pip','Bmt','Adds','DHpg','DHB','nrp','pk'] + for i in pksnrpscoregenes: + nra = 0 + nrat = 0 + nrcal = 0 + j = domaindict[i] + for k in j: + if k[0] == "PKS_AT": + nrat += 1 + preds = [] + preds.append(minowa_pks_preds[i + "_AT" + str(nrat)]) + preds.append(pks_code_preds[i + "_AT" + str(nrat)]) + cpred = "n" + for l in preds: + if preds.count(l) > 1: + if l in available_smiles_parts: + consensuspreds[i + "_AT" + str(nrat)] = l + else: + consensuspreds[i + "_AT" + str(nrat)] = "pk" + cpred = "y" + if cpred == "n": + consensuspreds[i + "_AT" + str(nrat)] = "pk" + if k[0] == "AMP-binding" or k[0] == "A-OX": + nra +=1 + preds = [] + preds.append(minowa_nrps_preds[i + "_A" + str(nra)]) + preds.append(nrps_svm_preds[i + "_A" + str(nra)]) + preds.append(nrps_code_preds[i + "_A" + str(nra)]) + cpred = "n" + for l in preds: + if preds.count(l) > 1: + if l in available_smiles_parts: + consensuspreds[i + "_A" + str(nra)] = l + else: + consensuspreds[i + "_A" + str(nra)] = "nrp" + cpred = "y" + if cpred == "n": + consensuspreds[i + "_A" + str(nra)] = "nrp" + if k[0] == "CAL_domain": + nrcal += 1 + if minowa_cal_preds[i + "_CAL" + str(nrcal)] in available_smiles_parts: + consensuspreds[i + "_CAL" + str(nrcal)] = minowa_cal_preds[i + "_CAL" + str(nrcal)] + else: + consensuspreds[i + "_CAL" + str(nrcal)] = "pk" + + #Write all prediction details to HTML files for each gene to be used as pop-up window + domainnamesdict = {} + for i in pksnrpscoregenes: + j = domaindict[i] + domainnames = [] + for k in j: + domainnames.append(k[0]) + domainnamesdict[i] = domainnames + for i in pksnrpscoregenes: + if "PKS_AT" in domainnamesdict[i] or "AMP-binding" in domainnamesdict[i] or "A-OX" in domainnamesdict[i] or "CAL_domain" in domainnamesdict[i]: + j = domaindict[i] + nrat = 0 + nra = 0 + nrcal = 0 + nrkr = 0 + for k in j: + if k[0] == "PKS_AT": + nrat += 1 + domainname = i + "_AT" + str(nrat) + htmloutfile = open(substrspecsfolder + domainname + ".html","w") + htmloutfile.write('\n\nPrediction details\n\n\n') + htmloutfile.write(minowa_pks_preds_details[domainname]) + htmloutfile.write(pks_code_preds_details[domainname]) + htmloutfile.write("Consensus Predictions: " + consensuspreds[domainname] + "") + htmloutfile.write('\n\n') + htmloutfile.close() + if k[0] == "AMP-binding" or k[0] == "A-OX": + nra += 1 + domainname = i + "_A" + str(nra) + htmloutfile = open(substrspecsfolder + domainname + ".html","w") + htmloutfile.write('\n\nPrediction details\n\n\n') + htmloutfile.write(nrps_svm_preds_details[domainname]) + htmloutfile.write(nrps_code_preds_details[domainname]) + htmloutfile.write(minowa_nrps_preds_details[domainname]) + htmloutfile.write("Consensus Prediction: '" + consensuspreds[domainname] + "'") + htmloutfile.write('\n\n') + htmloutfile.close() + if k[0] == "CAL_domain": + nrcal += 1 + domainname = i + "_CAL" + str(nrcal) + htmloutfile = open(substrspecsfolder + domainname + ".html","w") + htmloutfile.write('\n\nPrediction details\n\n\n') + htmloutfile.write(minowa_cal_preds_details[domainname]) + htmloutfile.write('\n\n') + htmloutfile.close() + + elapsed = (time.time() - starttime) + #print "5541Time since start: " + str(elapsed) + #Predict biosynthetic gene order in gene cluster using starter domains, thioesterase domains, gene order and docking domains + compound_pred_dict = {} + dockingdomainanalysis = [] + nrpspksclusters = [] + a = 1 + for i in geneclusters: + genecluster = i + clustercoregenes = clusterinfo[i][3] + clusterpksnrpsgenes = [] + for j in clustercoregenes: + if j in pksnrpscoregenes: + clusterpksnrpsgenes.append(j) + if len(clusterpksnrpsgenes) > 0: + nrpspksclusters.append(genecluster) + pksgenes = 0 + clusterpksgenes = [] + nrpsgenes = 0 + clusternrpsgenes = [] + hybridgenes = 0 + clusterhybridgenes = [] + for j in clusterpksnrpsgenes: + k = nrpspkstypedict[j] + if "PKS" in k and "NRPS" not in k: + pksgenes += 1 + clusterpksgenes.append(j) + elif "PKS" not in k and "NRPS" in k: + nrpsgenes += 1 + clusternrpsgenes.append(j) + elif "PKS/NRPS" in k: + if ("PKS_KS" in domainnamesdict[j] or "PKS_AT" in domainnamesdict[j]) and ("AMP-binding" not in domainnamesdict[j] and "A-OX" not in domainnamesdict[j] and "Condensation" not in domainnamesdict[j]): + pksgenes += 1 + clusterpksgenes.append(j) + elif ("PKS_KS" not in domainnamesdict[j] and "PKS_AT" not in domainnamesdict[j]) and ("AMP-binding" in domainnamesdict[j] or "A-OX" in domainnamesdict[j] or "Condensation" in domainnamesdict[j]): + nrpsgenes += 1 + clusternrpsgenes.append(j) + elif "PKS" in k and "NRPS" in k: + hybridgenes += 1 + clusterhybridgenes.append(j) + #If more than three PKS genes, use dock_dom_analysis if possible to identify order + dock_dom_analysis = "failed" + if pksgenes > 3 and nrpsgenes == 0 and hybridgenes == 0: + #print "Predicting PKS gene order by docking domain sequence analysis" + logfile.write("Predicting PKS gene order by docking domain sequence analysis") + dockhtmlfile = open(htmlfolder + "docking_analysis" + str(genecluster) + ".html","w") + #Find first and last genes based on starter module and TE / TD + startergene = "" + endinggene = "" + for k in clusterpksgenes: + if "Thioesterase" in domainnamesdict[k] or "TD" in domainnamesdict[k]: + if endinggene == "": + endinggene = k + else: + endinggene = "" + if len(domainnamesdict[k]) >=2 and "PKS_AT" == domainnamesdict[k][0] and "ACP" == domainnamesdict[k][1]: + if startergene == "": + startergene = k + else: + startergene = "" + if startergene == "": + for k in clusterpksgenes: + if len(domainnamesdict[k]) >=3 and "PKS_KS" == domainnamesdict[k][0] and "PKS_AT" == domainnamesdict[k][1] and "ACP" == domainnamesdict[k][2]: + if startergene == "": + startergene = k + else: + startergene = "" + break + #Extract N-terminal 50 residues of each non-starting protein, scan for docking domains using hmmsearch, parse output to locate interacting residues + ntermintresdict = {} + ntermnames = [] + ntermseqs = [] + for k in clusterpksgenes: + if k != startergene: + ntermnames.append(k) + seq = seqdict[k] + ntermseqs.append(seq[:50]) + ntermfasta = "docking_analysis/input.fasta" + z = 0 + for k in ntermnames: + writefasta([ntermnames[z]],[ntermseqs[z]],ntermfasta) + os.chdir("docking_analysis") + os.system("muscle -profile -quiet -in1 nterm.fasta -in2 input.fasta -out muscle.fasta") + intresidues = extractpositions("nterm.fasta","muscle.fasta",[2,15],"EryAIII_5_6_ref",ntermnames[z]) + ntermintresdict[ntermnames[z]] = intresidues + os.chdir("..") + z += 1 + #Extract C-terminal 100 residues of each non-ending protein, scan for docking domains using hmmsearch, parse output to locate interacting residues + ctermintresdict = {} + ctermnames = [] + ctermseqs = [] + for k in clusterpksgenes: + if k != endinggene: + ctermnames.append(k) + seq = seqdict[k] + ctermseqs.append(seq[-100:]) + ctermfasta = "docking_analysis/input.fasta" + z = 0 + for k in ctermnames: + writefasta([ctermnames[z]],[ctermseqs[z]],ctermfasta) + os.chdir("docking_analysis") + os.system("muscle -profile -quiet -in1 cterm.fasta -in2 input.fasta -out muscle.fasta") + intresidues = extractpositions("cterm.fasta","muscle.fasta",[55,64],"EryAII_ref",ctermnames[z]) + ctermintresdict[ctermnames[z]] = intresidues + os.chdir("..") + z += 1 + #If docking domains found in all, check for optimal order using interacting residues + genes_to_order = [] + z = 0 + for k in clusterpksgenes: + if k == startergene or k == endinggene: + pass + else: + genes_to_order.append(k) + z += 1 + possible_orders = list(itertools.permutations(genes_to_order,len(genes_to_order))) + hydrophobic = ["A","V","I","L","F","W","Y","M"] + positivecharge = ["H","K","R"] + negativecharge = ["D","E"] + other = ["C","G","P","S","T","N","Q","X","U"] + possible_orders_scoredict = {} + for k in possible_orders: + score = 0 + interactions = [] + z = 0 + for l in k[:-1]: + interactions.append([l,k[z + 1]]) + z += 1 + for l in interactions: + res1a = ctermintresdict[l[0]][0] + res1b = ntermintresdict[l[1]][0] + res2a = ctermintresdict[l[0]][1] + res2b = ntermintresdict[l[1]][1] + if (res1a in hydrophobic and res1b in hydrophobic) or (res1a in positivecharge and res1b in negativecharge) or (res1a in negativecharge and res1b in positivecharge): + score += 1 + if (res1a in positivecharge and res1b in positivecharge) or (res1a in negativecharge and res1b in negativecharge): + score = score - 1 + if (res2a in hydrophobic and res2b in hydrophobic) or (res2a in positivecharge and res2b in negativecharge) or (res2a in negativecharge and res2b in positivecharge): + score += 1 + if (res2a in positivecharge and res2b in positivecharge) or (res2a in negativecharge and res2b in negativecharge): + score = score - 1 + possible_orders_scoredict[k] = score + ranked_orders = sortdictkeysbyvaluesrev(possible_orders_scoredict) + ranked_orders_part = [] + ranked_orders2 = [] + a = 0 + ranked_orders_len = len(ranked_orders) - 1 + for i in ranked_orders: + if a == 0: + score = possible_orders_scoredict[i] + ranked_orders_part.append(i) + elif a == ranked_orders_len: + ranked_orders_part.append(i) + ranked_orders2 = ranked_orders2 + ranked_orders_part + else: + if possible_orders_scoredict[i] == score: + ranked_orders_part.append(i) + else: + ranked_orders_part.reverse() + ranked_orders2 = ranked_orders2 + ranked_orders_part + score = possible_orders_scoredict[i] + ranked_orders_part = [] + ranked_orders_part.append(i) + a += 1 + ranked_orders = ranked_orders2[:1000] + geneorders = ranked_orders + geneorders2 = [] + for l in geneorders: + geneorder = [] + if startergene != "": + geneorder.append(startergene) + [ geneorder.append(m) for m in l ] + #for m in l: + # geneorder.append(m) + if endinggene != "": + geneorder.append(endinggene) + geneorders2.append(geneorder) + geneorders = geneorders2 + if len(ranked_orders) == 1000: + dockhtmlfile.write('\n\n\n\n\nDocking domain analysis. Score for 1000 highest scoring gene orders:

\n') + else: + dockhtmlfile.write('\n\n\n\n\nDocking domain analysis. Scores for all possible gene orders:

\n') + dockhtmlfile.write('\n') + for l in geneorders: + string = "\n") + dockhtmlfile.write('\n
Gene order Score
" + for m in l: + string = string + m + "," + if startergene != "" and endinggene != "": + string = string[:-1] + " " + str(possible_orders_scoredict[tuple(l[1:-1])]) + elif startergene == "" and endinggene != "": + string = string[:-1] + " " + str(possible_orders_scoredict[tuple(l[:-1])]) + elif startergene != "" and endinggene == "": + string = string[:-1] + " " + str(possible_orders_scoredict[tuple(l[1:])]) + elif startergene == "" and endinggene == "": + string = string[:-1] + " " + str(possible_orders_scoredict[tuple(l)]) + dockhtmlfile.write(string + "
') + dockhtmlfile.close() + #print "Predicting PKS gene order by docking domain sequence analysis succeeded." + #Write html outfile with docking domain analysis output + # + logfile.write("Predicting PKS gene order by docking domain sequence analysis succeeded.") + dockingdomainanalysis.append(genecluster) + #If NRPS genes, mixed NRPS/PKS genes, PKS genes without detected docking domains, or clusters with a 1-3 PKS genes, assume colinearity + direction = 0 + for k in clusterpksnrpsgenes: + if strandsdict[k] == "+": + direction += 1 + elif strandsdict[k] == "-": + direction = direction - 1 + if direction < 0: + clusterpksnrpsgenes.reverse() + if "Thioesterase" in domainnamesdict[clusterpksnrpsgenes[0]] or "TD" in domainnamesdict[clusterpksnrpsgenes[0]]: + clusterpksnrpsgenes.reverse() + geneorder = clusterpksnrpsgenes + #Generate substrates order from predicted gene order and consensus predictions + prediction = "" + for k in geneorder: + domains = domainnamesdict[k] + nra = 0 + nrat = 0 + nrcal = 0 + for l in domains: + if "PKS_AT" in l: + nrat += 1 + prediction = prediction + consensuspreds[k + "_AT" + str(nrat)] + " " + if "AMP-binding" in l or "A-OX" in l: + nra += 1 + prediction = prediction + consensuspreds[k + "_A" + str(nra)] + " " + if "CAL_domain" in l: + nrcal += 1 + prediction = prediction + consensuspreds[k + "_CAL" + str(nrcal)] + " " + prediction = prediction[:-1] + compound_pred_dict[genecluster] = prediction + a += 1 + + #Combine predictions into a prediction of the final chemical structure and generate images + os.chdir("NRPeditor") + failedstructures = [] + for i in geneclusters: + genecluster = i + if compound_pred_dict.has_key(genecluster): + residues = compound_pred_dict[genecluster] + nrresidues = len(residues.split(" ")) + if nrresidues > 1: + if sys.platform == ('win32'): + structcommand = 'main input 100 4000 1000 AA DDV DIM ' + str(nrresidues + 1) + ' "' + elif sys.platform == ('linux2'): + structcommand = './main input 100 4000 1000 AA DDV DIM ' + str(nrresidues + 1) + ' "' + for i in residues.split(" "): + structcommand = structcommand + i + " " + structcommand = structcommand + 'TE"' + smilesinfo = os.popen(structcommand) + smilesinfo = smilesinfo.read() + smiles_string = (smilesinfo.split("core peptide: ")[1]).split("\ntermintype")[0] + if sys.platform == ('linux2'): + smiles_string.replace("[X]","[*:X]") + smiles_string2 = "" + a = 1 + for k in smiles_string: + if k == "X": + smiles_string2 = smiles_string2 + str(a) + a += 1 + else: + smiles_string2 = smiles_string2 + k + smiles_string = smiles_string2 + smilesfile = open("genecluster" + str(genecluster) + ".smi","w") + smilesfile.write(smiles_string) + smilesfile.close() + depictstatus = depict_smile(genecluster,structuresfolder) + if depictstatus == "failed": + failedstructures.append(genecluster) + elif clusterinfo[genecluster][0] == "ectoine": + smiles_string = "CC1=NCCC(N1)C(=O)O" + smilesfile = open("genecluster" + str(genecluster) + ".smi","w") + smilesfile.write(smiles_string) + smilesfile.close() + depictstatus = depict_smile(genecluster,structuresfolder) + if depictstatus == "failed": + failedstructures.append(genecluster) + elif genecluster in failedstructures: + del failedstructures[failedstructures.index(genecluster)] + compound_pred_dict[genecluster] = "ectoine " + os.chdir("..") + + elapsed = (time.time() - starttime) + #print "5826 Time since start: " + str(elapsed) + #ClusterBlast + if clusterblast == "y": + #Load gene cluster database into memory + #print "ClusterBlast: Loading gene clusters database into memory..." + logfile.write("ClusterBlast: Loading gene clusters database into memory...\n") + + os.chdir(genomename + "/clusterblast") + #file = open( os.path.join(antismash_path, "clusterblast/geneclusters.txt") ,"r") + #filetext = file.read() + #lines = filetext.split("\n") + clusters = {} + #for i in open(os.path.join(antismash_path, "clusterblast/geneclusters.txt")): + bin_path = os.path.join(antismash_path, "clusterblast/geneclusters.bin") + if os.path.exists( bin_path ): + clusters = cPickle.load( open(bin_path) ) + #print clusters + else: + for line in open( os.path.join(antismash_path, "clusterblast/geneclusters.txt") ,"r"): + line = line.strip() + tabs = line.split("\t") + accession = tabs[0] + clusterdescription = tabs[1] + clusternr = tabs[2] + clustertype = tabs[3] + clustername = accession + "_" + clusternr + clustertags = tabs[4].split(";") + clusterprots = tabs[5].split(";") + clusters[clustername] = [clusterprots,clusterdescription,clustertype,clustertags] + cPickle.dump(clusters, open(bin_path, 'w'), -1) + #Load gene cluster database proteins info into memory + #print "ClusterBlast: Loading gene cluster database proteins into memory..." + logfile.write("ClusterBlast: Loading gene cluster database proteins into memory...\n") + #file = open( os.path.join(antismash_path, "clusterblast/geneclusterprots.fasta") ,"r") + #filetext = file.read() + #filetext = filetext.replace("\r","\n") + #lines = filetext.split("\n") + proteingeneclusters = {} + proteinlocations = {} + proteinstrands = {} + proteinannotations = {} + proteintags = {} + bin_path = os.path.join(antismash_path, "clusterblast/geneclusterprots.fasta.bin") + if os.path.exists( bin_path ): + (proteingeneclusters, proteinlocations, proteinstrands, proteinannotations, proteintags) = cPickle.load( open(bin_path, 'r') ) + else: + for line in open( os.path.join(antismash_path, "clusterblast/geneclusterprots.fasta") ,"r"): + line = line.replace('\n', '') + if line.startswith(">"): + tabs = line.split("|") + #print 'Protein:', tabs + protein = tabs[6] + locustag = tabs[4] + if accessiondict.has_key(locustag): + locustag = "h_" + locustag + proteintags[protein] = locustag + clustername = tabs[0] + "_" + tabs[1] + proteingeneclusters[protein] = clustername + location = tabs[2] + proteinlocations[protein] = location + strand = tabs[3] + proteinstrands[protein] = strand + annotation = tabs[5] + proteinannotations[protein] = annotation + cPickle.dump([proteingeneclusters, proteinlocations, proteinstrands, proteinannotations, proteintags], open(bin_path, 'w'), -1) + #Run BLAST on gene cluster proteins of each cluster on itself to find internal homologs, store groups of homologs - including singles - in a dictionary as a list of lists accordingly + #print "Finding internal homologs in each gene cluster.." + logfile.write("Finding internal homologs in each gene cluster..\n") + internalhomologygroupsdict = {} + for i in geneclusters: + clusternumber = i + #Create input fasta files for BLAST search + queryclusterprotslist = clusterinfo[i][4] + queryclusterprots = [] + for i in queryclusterprotslist: + queryclusterprots.append(i[4]) + queryclusternames = [] + queryclusterseqs = [] + for i in queryclusterprots: + seq = seqdict[i] + name = fullnamedict[i] + queryclusterseqs.append(seq) + queryclusternames.append(name) + writefasta(queryclusternames,queryclusterseqs,"internal_input.fasta") + #Run and parse BLAST search + makeblastdbcommand = "makeblastdb -in internal_input.fasta -out internal_input.fasta -dbtype prot" + blastsearch = "blastp -db internal_input.fasta -query internal_input.fasta -outfmt 6 -max_target_seqs 1000 -evalue 1e-05 -out internal_input.out" + if "--gui" in sys.argv and sys.argv[sys.argv.index("--gui") + 1] == "y": + os.popen(makeblastdbcommand) + os.popen(blastsearch) + else: + os.system(makeblastdbcommand) + os.system(blastsearch) + #print "5920 makeblastdb finised" + blastoutput = open("internal_input.out","r").read() + minseqcoverage = 25 + minpercidentity = 30 + seqlengths = fastaseqlengths(proteins) + iblastinfo = blastparse(blastoutput,minseqcoverage,minpercidentity,seqlengths,geneclustergenes) + iblastdict = iblastinfo[0] + iquerylist = iblastinfo[1] + #find and store internal homologs + groups = [] + for j in queryclusternames: + jsplit = j.split("|")[4] + if iblastdict.has_key(j): + hits = iblastdict[j][0] + group = [] + for k in hits: + if k[:2] == "h_": + group.append(k[2:]) + elif k.count("|") > 4: + group.append(k.split("|")[4]) + else: + group.append(k) + if jsplit not in group: + group.append( jsplit ) + x = 0 + for l in groups: + for m in group: + if m in l: + del groups[x] + [group.append(n) for n in l if n not in group] + #for n in l: + # if n not in group: + # group.append(n) + break + x += 1 + group.sort() + groups.append(group) + else: + groups.append([ jsplit ]) + internalhomologygroupsdict[clusternumber] = groups + + #Run BLAST on gene cluster proteins of each cluster and parse output + #print "5961 Running NCBI BLAST+ gene cluster searches.." + logfile.write("Running NCBI BLAST+ gene cluster searches..\n") + for i in geneclusters: + clusternumber = i + #print " Gene cluster " + str(clusternumber) + #Create input fasta files for BLAST search + queryclusterprotslist = clusterinfo[i][4] + queryclusterprots = [] + for i in queryclusterprotslist: + queryclusterprots.append(i[4]) + queryclusternames = [] + queryclusterseqs = [] + for i in queryclusterprots: + seq = seqdict[i] + name = fullnamedict[i] + queryclusterseqs.append(seq) + queryclusternames.append(name) + equalpartsizes = int(len(queryclusternames)/nrcpus) + for i in range(nrcpus): + if i == 0: + setnames = queryclusternames[:equalpartsizes] + setseqs = queryclusterseqs[:equalpartsizes] + elif i == (nrcpus - 1): + setnames = queryclusternames[(i*equalpartsizes):] + setseqs = queryclusterseqs[(i*equalpartsizes):] + else: + setnames = queryclusternames[(i*equalpartsizes):((i+1)*equalpartsizes)] + setseqs = queryclusterseqs[(i*equalpartsizes):((i+1)*equalpartsizes)] + writefasta(setnames,setseqs,"input" + str(i) + ".fasta") + processes = [] + processnames = [] + for i in range(nrcpus): + processes.append(Process(target=runblast, args=["input" + str(i) + ".fasta"])) + [i.start() for i in processes] + time.sleep(10) + while True: + processrunning = "n" + for i in processes: + if i.is_alive(): + processrunning = "y" + if processrunning == "y": + time.sleep(5) + else: + break + [i.join() for i in processes] + blastoutput = "" + for i in range(nrcpus): + output = open("input" + str(i) + ".out","r") + output = output.read() + blastoutput = blastoutput + output + os.chdir("..") + blastoutputfile = open("./clusterblastoutput.txt","w") + blastoutputfile.write(blastoutput) + blastoutputfile.close() + os.chdir("clusterblast") + #print " Blast search finished. Parsing results..." + logfile.write(" Blast search finished. Parsing results...\n") + minseqcoverage = 25 + minpercidentity = 30 + seqlengths = fastaseqlengths(proteins) + blastinfo = blastparse(blastoutput,minseqcoverage,minpercidentity,seqlengths,geneclustergenes) + blastdict = blastinfo[0] + querylist = blastinfo[1] + #Remove queries without hits + querylist2 = [] + for i in querylist: + if blastdict.has_key(i): + querylist2.append(i) + else: + pass + querylist = querylist2 + hitclusters = blastinfo[2] + #Score BLAST output on all gene clusters + #Rank gene cluster hits based on 1) number of protein hits covering >25% sequence length or at least 100aa alignment, with >30% identity and 2) cumulative blast score + #Find number of protein hits and cumulative blast score for each gene cluster + #print " Scoring Blast outputs on database of gene clusters..." + logfile.write(" Scoring Blast outputs on database of gene clusters...\n") + hitclusterdict = {} + hitclusterdata = {} + for i in hitclusters: + hitclusterdatalist = [] + nrhits = float(0) + nrcoregenehits = float(0) + cumblastscore = float(0) + hitpositions = [] + hitposcorelist = [] + for j in querylist: + querynrhits = 0 + querycumblastscore = float(0) + nrhitsplus = "n" + for k in blastdict[j][0]: + if i == blastdict[j][1][k][0]: + if [querylist.index(j),clusters[i][0].index(blastdict[j][1][k][9])] not in hitpositions: + nrhitsplus = "y" + querynrhits += 1 + blastscore = float(blastdict[j][1][k][6]) / 1000000 + querycumblastscore = querycumblastscore + blastscore + hitclusterdatalist.append([j,k,blastdict[j][1][k][5],blastdict[j][1][k][6],blastdict[j][1][k][7],blastdict[j][1][k][8]]) + hitclusterdata[i] = hitclusterdatalist + hitpositions.append([querylist.index(j),clusters[i][0].index(blastdict[j][1][k][9])]) + if nrhitsplus == "y": + nrhits += 1 + if j.split("|")[4] in allcoregenes: + nrcoregenehits += 0.1 + for hit in range(querynrhits): + hitposcorelist.append(1) + else: + for hit in range(querynrhits): + hitposcorelist.append(0) + cumblastscore = cumblastscore + float(querycumblastscore) + query_givenscores_querydict = {} + query_givenscores_hitdict = {} + #Find groups of hits + hitgroupsdict = {} + for p in hitpositions: + if not hitgroupsdict.has_key(p[0]): + hitgroupsdict[p[0]] = [p[1]] + else: + hitgroupsdict[p[0]].append(p[1]) + #Calculate synteny score; give score only if more than one hits (otherwise no synteny possible), and only once for every query gene and every hit gene + synteny_score = 0 + z = 1 + if nrhits > 1: + for p in hitpositions[:-1]: + tandem = "n" + #Check if a gene homologous to this gene has already been scored for synteny in the previous entry + if p[1] in hitgroupsdict[hitpositions[z][0]]: + tandem = "y" + #Score entry + if ((not query_givenscores_querydict.has_key(p[0])) or query_givenscores_querydict[p[0]] == 0) and ((not query_givenscores_hitdict.has_key(p[1])) or query_givenscores_hitdict[p[1]] == 0) and tandem == "n": + q = hitpositions[z] + if (abs(p[0] - q[0]) < 2) and abs(p[0]-q[0]) == abs(p[1]-q[1]): + synteny_score += 1 + if hitposcorelist[z - 1] == 1 or hitposcorelist[z] == 1: + synteny_score += 1 + query_givenscores_querydict[p[0]] = 1 + query_givenscores_hitdict[p[1]] = 1 + else: + query_givenscores_querydict[p[0]] = 0 + query_givenscores_hitdict[p[1]] = 0 + z += 1 + #Give bonus to gene clusters with >0 core gene hits + if nrcoregenehits > 0: + corebonus = 3 + else: + corebonus = 0 + #sorting score is based on number of hits (discrete values) & cumulative blast score (behind comma values) + sortingscore = nrhits + synteny_score + corebonus + nrcoregenehits + cumblastscore + hitclusterdict[i] = sortingscore + #Sort gene clusters + rankedclusters = sortdictkeysbyvaluesrev(hitclusterdict) + rankedclustervalues = sortdictkeysbyvaluesrevv(hitclusterdict) + #Output for each hit: table of genes and locations of input cluster, table of genes and locations of hit cluster, table of hits between the clusters + #print " Writing output file..." + logfile.write(" Writing output file...\n") + #os.chdir("..") + #os.chdir(genomename) + #os.chdir("clusterblast") + out_file = open("cluster" + str(clusternumber) + ".txt","w") + out_file.write("ClusterBlast scores for " + infile) + out_file.write("\n\nTable of genes, locations, strands and annotations of query cluster:\n") + #out_file.write("\n") + #out_file.write("Table of genes, locations, strands and annotations of query cluster:") + #out_file.write("\n") + for i in queryclusterprots: + out_file.write("%s\t%s\t%s\t%s\t%s\t\n" % (i, proteins[3][i][0], proteins[3][i][1], proteins[3][i][2], proteins[3][i][3])) + """out_file.write(i) + out_file.write("\t") + out_file.write(proteins[3][i][0]) + out_file.write("\t") + out_file.write(proteins[3][i][1]) + out_file.write("\t") + out_file.write(proteins[3][i][2]) + out_file.write("\t") + out_file.write(proteins[3][i][3]) + out_file.write("\t") + out_file.write("\n")""" + out_file.write("\n\nSignificant hits: \n") + #out_file.write("\n") + #out_file.write("Significant hits: ") + #out_file.write("\n") + z = 0 + for i in rankedclusters[:100]: + #out_file.write(str(z+1) + ". " + i + "\t" + clusters[i][1]) + #out_file.write("\n") + out_file.write("%s. %s\t%s\n" % ((z+1), i, clusters[i][1]) ) + z += 1 + out_file.write("\n\n") + #out_file.write("\n") + z = 0 + out_file.write("Details:") + for i in rankedclusters[:100]: + value = str(rankedclustervalues[z]) + nrhits = value.split(".",1)[0] + if nrhits > 0: + cumblastscore = str(int(float(value.split(".")[1]))) + out_file.write("\n\n>>\n\n%s. %s\nSource: %s\nType: %s\nNumber of proteins with BLAST hits to this cluster: %s\nCumulative BLAST score: %s\n\nTable of genes, locations, strands and annotations of subject cluster:\n" % (z+1, i, clusters[i][1], clusters[i][2], nrhits, cumblastscore)) + clusterproteins = clusters[i][0] + #print 'clusterproteins\n\n', clusterproteins + """out_file.write("\n\n") + out_file.write(">>") + out_file.write("\n") + cumblastscore = str(int(float(value.split(".")[1]))) + out_file.write("\n") + out_file.write(str(z+1) + ". " + i) + out_file.write("\n") + out_file.write("Source: " + clusters[i][1]) + out_file.write("\n") + out_file.write("Type: " + clusters[i][2]) + out_file.write("\n") + out_file.write("Number of proteins with BLAST hits to this cluster: " + nrhits) + out_file.write("\n") + out_file.write("Cumulative BLAST score: " + cumblastscore) + out_file.write("\n") + out_file.write("\n") + out_file.write("Table of genes, locations, strands and annotations of subject cluster:") + out_file.write("\n") + clusterproteins = clusters[i][0]""" + + for j in clusterproteins: + #print '##########asdfasdf######', j, '---'+proteinlocations.keys()[0]+ '---', proteinannotations.has_key(j), proteinstrands.has_key(j), proteinlocations.has_key(j) + if proteinlocations.has_key(j) and proteinannotations.has_key(j) and proteinstrands.has_key(j): + if proteintags[j] == "no_locus_tag": + out_file.write(j) + else: + out_file.write(proteintags[j]) + out_file.write( "\t%s\t%s\t%s\t%s\t%s\n" % (j, proteinlocations[j].split("-")[0], proteinlocations[j].split("-")[1], proteinstrands[j], proteinannotations[j]) ) + """out_file.write("\t") + out_file.write(j) + out_file.write("\t") + out_file.write(proteinlocations[j].split("-")[0]) + out_file.write("\t") + out_file.write(proteinlocations[j].split("-")[1]) + out_file.write("\t") + out_file.write(proteinstrands[j]) + out_file.write("\t") + out_file.write(proteinannotations[j]) + out_file.write("\n") + """ + + out_file.write("\nTable of Blast hits (query gene, subject gene, %identity, blast score, %coverage, e-value):\n") + if i in hitclusterdata.keys(): + tabledata = hitclusterdata[i] + for x in tabledata: + w = 0 + for y in x: + if w == 0: + out_file.write( "%s\t" % y.split("|")[4] ) + #out_file.write("\t") + w += 1 + else: + out_file.write("%s\t" % y) + #out_file.write("\t") + out_file.write("\n") + else: + "data not found" + out_file.write("\n") + out_file.write("\n") + z += 1 + #os.chdir("..") + #os.chdir("..") + #os.chdir("clusterblast") + os.chdir("..") + out_file.close() + + elapsed = (time.time() - starttime) + #print "Time since start: " + str(elapsed) + #smCOG analysis + smcogtreedict = {} + if smcogs == "y": + #print "Performing smCOG analysis" + logfile.write("Performing smCOG analysis\n") + hmmsearch = hmmscan_path + " --cpu " + str(nrcpus) + " -E 1E-6 -o " + "./smcogs/smcogshmm_output.txt" + " --noali --tblout " + "./smcogs/smcogshmm.txt "+ hmms_path +"smcogs.hmm " + "./clusterblast/geneclusterprots.fasta" + #print hmmsearch + os.system(hmmsearch) + #print 'finised' + smcoghmmlengthsdict = hmmlengths(hmms_path+"smcogs.hmm") + smcogdict = hmmscanparse("./smcogs/smcogshmm_output.txt", smcoghmmlengthsdict) + smcogdict2 = {} + for i in smcogdict.keys(): + newkey = i.split("|")[4] + smcogdict2[newkey] = smcogdict[i] + smcogdict = smcogdict2 + #Write output + #os.chdir(genomename) + os.chdir("smcogs") + smcogfile = open("smcogs.txt","w") + for k in geneclustergenes: + if k not in pksnrpscoregenes: + l = smcogdict[k] + smcogfile.write(">> " + k + "\n") + smcogfile.write("name\tstart\tend\te-value\tscore\n") + smcogfile.write("** smCOG hits **\n") + for i in l: + smcogfile.write(str(i[0]) + "\t" + str(i[1]) + "\t" + str(i[2]) + "\t" + str(i[3]) + "\t" + str(i[4]) + "\n") + smcogfile.write("\n\n") + smcogfile.close() + os.chdir("..") + os.chdir("..") + #smCOG phylogenetic tree construction + #print "Calculating and drawing phylogenetic trees of cluster genes with smCOG members" + logfile.write("Calculating and drawing phylogenetic trees of cluster genes with smCOG members") + os.chdir("smcogtree") + smcoganalysisgenes = [] + #for k in geneclustergenes: + # if k not in pksnrpscoregenes: + # smcoganalysisgenes.append(k) + [smcoganalysisgenes.append(k) for k in geneclustergenes if k not in pksnrpscoregenes] + smcogsets = [] + equalpartsizes = int(len(smcoganalysisgenes)/nrcpus) + for i in range(nrcpus): + if i == 0: + geneslist = smcoganalysisgenes[:equalpartsizes] + elif i == (nrcpus - 1): + geneslist = smcoganalysisgenes[(i*equalpartsizes):] + else: + geneslist = smcoganalysisgenes[(i*equalpartsizes):((i+1)*equalpartsizes)] + smcogsets.append(geneslist) + processes = [] + processnames = [] + z = 0 + for k in smcogsets: + processes.append(Process(target=smcog_analysis, args=[k,z,accessiondict,seqdict,smcogdict,smcogsoutputfolder])) + z += 1 + for k in processes: + k.start() + time.sleep(1) + while True: + processrunning = "n" + for k in processes: + if k.is_alive(): + processrunning = "y" + if processrunning == "y": + time.sleep(5) + else: + break + for k in processes: + k.join() + os.chdir("..") + currentpath = os.getcwd() + os.chdir(smcogsoutputfolder) + dircontents = getdircontents() + for k in dircontents: + #POTENTIAL pERFORMANCE gainfor k in glob.glob('*.png'): + if ".png" in k: + tag = k.split(".png")[0] + smcogtreedict[tag] = tag + ".png" + os.chdir(currentpath) + + + ##Visualization + #Read in ClusterBlast data + #Read in PubMed / PubChem links of database gene clusters + if clusterblast == "y": + if genomename in os.getcwd(): + os.chdir('..') + pubmed_dict = {} + pubchem_dict = {} + known_compound_dict = {} + #pubfile = open(antismash_path + "pubmed_pubchem_links.txt","r") + #pubfile = pubfile.read() + #publines = pubfile.split("\n") + #for i in publines: + bin_path = os.path.join(antismash_path, "pubmed_pubchem_links.bin") + if os.path.exists( bin_path ): + (pubmed_dict, pubchem_dict, known_compound_dict) = cPickle.load( open(bin_path) ) + else: + for line in open(antismash_path + "pubmed_pubchem_links.txt","r"): + line = line.replace('\n', '') + tabs = line.split("\t") + acc = tabs[0] + if tabs[1] != "": + pubmed_dict[acc] = tabs[1] + if tabs[2] != "": + pubchem_dict[acc] = tabs[2] + if tabs[3] != "": + known_compound_dict[acc] = tabs[3] + cPickle.dump([pubmed_dict, pubchem_dict, known_compound_dict], open(bin_path, 'w'), -1) + #print "Writing visualization SVGs and XHTML" + logfile.write("Writing visualization SVGs and XHTML\n") + queryclusterdata = {} + nrhitgeneclusters = {} + cblastclusternr = 1 + #print os.getcwd() + if clusterblast == "y": + for x in geneclusters: + clusterblastfile = open(clusterblastoutputfolder + "cluster" + str(x) + ".txt","r") + #print clusterblastfile + clusterblastfile = clusterblastfile.read() + clusterblastfile = clusterblastfile.replace("\r","\n") + toptenhitclusters = [] + #Identify top ten hits for visualization + hitlines = ((clusterblastfile.split("Significant hits: \n")[1]).split("\nDetails:")[0]).split("\n") + #print '\n\n#######hitlines\n', hitlines + a = 0 + cb_accessiondict = {} + b = 1 + for i in hitlines: + if " " in i: + cb_accessiondict[b] = (i.split("\t")[0]).split(" ")[1] + if genomic_accnr == "" or genomic_accnr not in i: + b += 1 + if a < 10: + if len(i) < 80: + toptenhitclusters.append(i) + elif len(i) >= 80: + j = i[0:77] + "..." + toptenhitclusters.append(j) + a += 1 + #print clusterblastfile + details = (clusterblastfile.split("\nDetails:")[1]).split(">>")[1:] + #print details + nrhitclusters = len(toptenhitclusters) + #Save query gene cluster data + querylines = ((clusterblastfile.split("Table of genes, locations, strands and annotations of query cluster:\n")[1]).split("\n\n\nSignificant hits:")[0]).split("\n") + queryclustergenes = [] + queryclustergenesdetails = {} + for i in querylines: + tabs = i.split("\t") + queryclustergenes.append(tabs[0]) + queryclustergenesdetails[tabs[0]] = [tabs[1],tabs[2],tabs[3],tabs[4]] + #For every gene cluster, store hit genes and details + colorgroupsdict = {} + hitclusterdata = {} + hitclusternr = 1 + compound_found = "n" + nrhitgeneclusters[x] = 0 + for i in details: + hitclustergenes = [] + hitclustergenesdetails = {} + #Only calculate for first ten hit gene clusters + if genomic_accnr == "" or genomic_accnr not in i: + if hitclusternr <= 10: + nrhitgeneclusters[x] = hitclusternr + accession = cb_accessiondict[hitclusternr] + hitclustergeneslines = ((i.split("Table of genes, locations, strands and annotations of subject cluster:\n")[1]).split("\n\nTable of Blast hits ")[0]).split("\n") + #print '***********\n', i, '\n' + #print hitclustergeneslines + for j in hitclustergeneslines: + tabs = j.split("\t") + hitclustergenes.append(tabs[0]) + hitclustergenesdetails[tabs[0]] = [tabs[2],tabs[3],tabs[4],tabs[5],tabs[1]] + + blasthitslines = ((i.split("%coverage, e-value):\n")[1]).split("\n\n")[0]).split("\n") + querygeneswithhits = [] + coregeneswithhits = [] + + + blasthitdict = {} + blastdetailsdict = {} + querygenes = [] + revblasthitdict = {} + hitgenes = [] + + + for k in blasthitslines: + tabs = k.split("\t") + if tabs[0] not in querygeneswithhits: + querygeneswithhits.append(tabs[0]) + if tabs[0] in allcoregenes and tabs[0] not in coregeneswithhits: + coregeneswithhits.append(tabs[0]) + + + if blasthitdict.has_key(tabs[0]): + hits = blasthitdict[tabs[0]] + hits.append(tabs[1]) + blasthitdict[tabs[0]] = hits + if revblasthitdict.has_key(tabs[1]): + revhits = revblasthitdict[tabs[1]] + revhits.append(tabs[0]) + revblasthitdict[tabs[1]] = revhits + else: + revblasthitdict[tabs[1]] = [tabs[0]] + blastdetailsdict[tabs[0] + "_|_|_" + tabs[1]] = [tabs[5],tabs[3]] + if tabs[0] not in querygenes: + querygenes.append(tabs[0]) + hitgenes.append(tabs[1]) + else: + blasthitdict[tabs[0]] = [tabs[1]] + if revblasthitdict.has_key(tabs[1]): + revhits = revblasthitdict[tabs[1]] + revhits.append(tabs[0]) + revblasthitdict[tabs[1]] = revhits + else: + revblasthitdict[tabs[1]] = [tabs[0]] + blastdetailsdict[tabs[0] + "_|_|_" + tabs[1]] = [tabs[5],tabs[3]] + if tabs[0] not in querygenes: + querygenes.append(tabs[0]) + hitgenes.append(tabs[1]) + + + + for k in known_compound_dict.keys(): + if k in i and compound_found == "n" and len(querygeneswithhits) > 2 and len(coregeneswithhits) > 0: + ws0.write(x,4,known_compound_dict[k]) + compound_found = "y" + """blasthitdict = {} + blastdetailsdict = {} + querygenes = [] + revblasthitdict = {} + hitgenes = [] + for i in blasthitslines: + tabs = i.split("\t") + if blasthitdict.has_key(tabs[0]): + hits = blasthitdict[tabs[0]] + hits.append(tabs[1]) + blasthitdict[tabs[0]] = hits + if revblasthitdict.has_key(tabs[1]): + revhits = revblasthitdict[tabs[1]] + revhits.append(tabs[0]) + revblasthitdict[tabs[1]] = revhits + else: + revblasthitdict[tabs[1]] = [tabs[0]] + blastdetailsdict[tabs[0] + "_|_|_" + tabs[1]] = [tabs[5],tabs[3]] + if tabs[0] not in querygenes: + querygenes.append(tabs[0]) + hitgenes.append(tabs[1]) + else: + blasthitdict[tabs[0]] = [tabs[1]] + if revblasthitdict.has_key(tabs[1]): + revhits = revblasthitdict[tabs[1]] + revhits.append(tabs[0]) + revblasthitdict[tabs[1]] = revhits + else: + revblasthitdict[tabs[1]] = [tabs[0]] + blastdetailsdict[tabs[0] + "_|_|_" + tabs[1]] = [tabs[5],tabs[3]] + if tabs[0] not in querygenes: + querygenes.append(tabs[0]) + hitgenes.append(tabs[1]) + """ + #Make groups of genes for coloring + colorgroups = [] + internalgroups = internalhomologygroupsdict[x] + for i in internalgroups: + querygenes_and_hits = [] + for j in i: + #Make list of query gene and its hits + additionalhits = [] + #For each hit, check if it was also hit by another gene; if so, only add it to the group if this hit had the lowest blast score + otherscores = [] + queryscore = 0 + if blasthitdict.has_key(j): + for k in blasthitdict[j]: + for l in blastdetailsdict.keys(): + if k in l and j in l: + queryscore = blastdetailsdict[l][1] + elif k in l and j not in l: + otherscores.append(blastdetailsdict[l][1]) + allscores = otherscores + [queryscore] + if queryscore == max(allscores): + additionalhits.append(k) + #Add additional hits to the querygenes_and_hits list that will form a colorgroup + querygenes_and_hits = querygenes_and_hits + additionalhits + if j not in querygenes_and_hits: + querygenes_and_hits.append(j) + if len(querygenes_and_hits) > 0: + colorgroups.append(querygenes_and_hits) + colorgroupsdict[hitclusternr] = colorgroups + hitclusterdata[hitclusternr] = [colorgroupsdict,hitclustergenes,hitclustergenesdetails,queryclustergenes,queryclustergenesdetails,toptenhitclusters,accession] + hitclusternr += 1 + elif hitclusternr > 10 and hitclusternr <= 50: + blasthitslines = ((i.split("%coverage, e-value):\n")[1]).split("\n\n")[0]).split("\n") + querygeneswithhits = [] + coregeneswithhits = [] + for k in blasthitslines: + tabs = k.split("\t") + if tabs[0] not in querygeneswithhits: + querygeneswithhits.append( tabs[0] ) + if tabs[0] in allcoregenes and tabs[0] not in coregeneswithhits: + coregeneswithhits.append(tabs[0]) + for k in known_compound_dict.keys(): + if k in i and compound_found == "n" and len(querygeneswithhits) > 2 and len(coregeneswithhits) > 0: + ws0.write(x,4,known_compound_dict[k]) + compound_found = "y" + hitclusternr += 1 + queryclusterdata[cblastclusternr] = [nrhitclusters,hitclusterdata] + cblastclusternr += 1 + wb.save(genomename + "/" + genomename + ".geneclusters.xls") + #Gather and store data on each gene cluster + gtrcoglist = ['SMCOG1045','SMCOG1062','SMCOG1102'] + transportercoglist = ['SMCOG1000','SMCOG1005','SMCOG1011','SMCOG1020','SMCOG1029','SMCOG1033','SMCOG1035','SMCOG1044','SMCOG1065','SMCOG1067','SMCOG1069','SMCOG1074','SMCOG1085','SMCOG1096','SMCOG1106','SMCOG1118','SMCOG1131','SMCOG1166','SMCOG1169','SMCOG1184','SMCOG1202','SMCOG1205','SMCOG1214','SMCOG1234','SMCOG1243','SMCOG1245','SMCOG1252','SMCOG1254','SMCOG1288'] + qgeneclusterdata = {} + if smcogs == "y": + smcogdict2 = {} + smcogdescriptions = {} + for i in smcogdict.keys(): + if len(smcogdict[i]) > 0 and len(smcogdict[i][0]) > 0 and ":" in smcogdict[i][0][0]: + smcogdict2[i] = (smcogdict[i][0][0]).split(":")[0] + smcogdescriptions[(smcogdict[i][0][0]).split(":")[0]] = (smcogdict[i][0][0]).split(":")[1] + elif len(smcogdict[i]) > 0: + smcogdict2[i] = smcogdict[i][0][0] + smcogdict = smcogdict2 + for genecluster in geneclusters: + clustergenes = clusterinfo[genecluster][4] + clustergenes2 = [] + #for i in clustergenes: + # clustergenes2.append(i[4]) + [clustergenes2.append(i[4]) for i in clustergenes] + clustergenes = clustergenes2 + clusternr = 1 + clustertype = clusterinfo[genecluster][0] + annotations = {} + colors = [] + starts = [] + ends = [] + strands = [] + pksnrpsprots = [] + gtrs = [] + transporters = [] + for j in clustergenes: + annotations[j] = proteins[3][j][3] + starts.append(int(proteins[3][j][0])) + ends.append(int(proteins[3][j][1])) + strands.append(proteins[3][j][2]) + if j in allcoregenes: + colors.append("#810E15") + else: + colors.append("grey") + if j in pksnrpscoregenes: + pksnrpsprots.append(j) + if smcogs == "y": + if smcogdict.has_key(j) and len(smcogdict[j]) > 0 : + if smcogdict[j][0] in gtrcoglist: + gtrs.append(j) + if smcogdict[j][0] in transportercoglist: + transporters.append(j) + clustersize = max(ends) - min(starts) + if clusterblast == "n": + nrhitgeneclusters = {} + for i in geneclusters: + nrhitgeneclusters[i] = 0 + hitgeneclusters = range(1,(nrhitgeneclusters[genecluster] + 1)) + hitgeneclusterdata = {} + hitgeneclusterdata[genecluster] = [hitgeneclusters] + pksnrpsprotsnames = nrpspkstypedict + pksnrpsdomains = {} + domlist = [] + domsdetails = {} + substrspecnrpspredictordict = {} + substrspecminowadict = {} + substrspecpkssigdict = {} + substrspecconsensusdict = {} + krpredictionsdict = {} + for i in pksnrpsprots: + domlist = [] + domsdetails = {} + doms = domaindict[i] + for j in doms: + nr = 1 + while j[0] + str(nr) in domlist: + nr += 1 + domname = j[0] + str(nr) + domlist.append(domname) + domsdetails[domname] = [j[1],j[2]] + if "AMP-binding" in domname or "A-OX" in domname: + domname2 = i + "_" + "A" + str(nr) + substrspecminowadict[domname2] = minowa_nrps_preds[i + "_A" + str(nr)] + substrspecnrpspredictordict[domname2] = [nrps_code_preds[i + "_A" + str(nr)],nrps_svm_preds[i + "_A" + str(nr)]] + substrspecconsensusdict[domname2] = consensuspreds[i + "_A" + str(nr)] + if "PKS_AT" in domname: + domname2 = i + "_" + "AT" + str(nr) + substrspecminowadict[domname2] = minowa_pks_preds[i + "_AT" + str(nr)] + substrspecpkssigdict[domname2] = pks_code_preds[i + "_AT" + str(nr)] + substrspecconsensusdict[domname2] = consensuspreds[i + "_AT" + str(nr)] + if "CAL_domain" in domname: + domname2 = i + "_" + "CAL" + str(nr) + substrspecminowadict[domname2] = minowa_cal_preds[i + "_CAL" + str(nr)] + substrspecconsensusdict[domname2] = consensuspreds[i + "_CAL" + str(nr)] + if "CAL_domain" in domname: + domname2 = i + "_" + "CAL" + str(nr) + substrspecminowadict[domname2] = minowa_cal_preds[i + "_CAL" + str(nr)] + substrspecconsensusdict[domname2] = consensuspreds[i + "_CAL" + str(nr)] + if "PKS_KR" in domname: + domname2 = i + "_" + "KR" + str(nr) + krpredictionsdict[domname2] = [kr_activity_preds[i + "_KR" + str(nr)],kr_stereo_preds[i + "_KR" + str(nr)]] + pksnrpsdomains[i] = [domlist,domsdetails] + if compound_pred_dict.has_key(genecluster): + structpred = compound_pred_dict[genecluster] + else: + structpred = "N/A" + qgeneclusterdata[genecluster] = [clustertype,clustersize,clustergenes,annotations,starts,ends,strands,pksnrpsprots,pksnrpsprotsnames,pksnrpsdomains,substrspecnrpspredictordict,substrspecminowadict,substrspecpkssigdict,substrspecconsensusdict,gtrs,transporters,colors,hitgeneclusterdata,structpred,krpredictionsdict] + + #Create genecluster svg for each gene cluster + geneposdict = {} + for qclusternr in geneclusters: + data = qgeneclusterdata[qclusternr] + #Some of the below 23 lines may already be internal to script, scan to remove unnecessary data fetching + clustertype = data[0] + clustersize = data[1] + genes = data[2] + annotations = data[3] + starts = data[4] + ends = data[5] + strands = data[6] + pksnrpsprots = data[7] + pksnrpsprotsnames = data[8] + pksnrpsdomains = data[9] + substrspecnrpspredictordict = data[10] + substrspecminowadict = data[11] + substrspecpkssigdict = data[12] + substrspecconsensusdict = data[13] + gtrs = data[14] + transporters = data[15] + colors = data[16] + hitgeneclusterdata = data[17] + structpred = data[18] + krpredictionsdict = data[19] + relpositions = relativepositions(starts,ends,clustersize) + rel_starts = relpositions[0] + rel_ends = relpositions[1] + y = 0 + for i in genes: + geneposdict[i] = [starts[y],ends[y]] + y += 1 + s = geneclustersvg(genes,rel_starts,rel_ends,strands,geneposdict,pksnrpsprots,pksnrpsdomains,qclusternr) + outfile = open(svgfolder + "genecluster" + str(qclusternr) + ".svg","w") + outfile.write(s.getXML()) + outfile.close() + #Create ClusterBlast svg + if clusterblast == "y": + clusterblastpositiondata = {} + #Create alignment svg for each pair of hit&query + for i in geneclusters: + hitclusters = range(queryclusterdata[i][0] + 1)[1:] + #Create svgs for pairwise gene cluster alignment + colorschemedict,rgbcolorscheme = calculate_colorgroups(i,hitclusters,queryclusterdata,internalhomologygroupsdict) + for k in hitclusters: + cresults = clusterblastresults(i,[k],queryclusterdata,colorschemedict,rgbcolorscheme) + s = cresults[0] + clusterblastpositiondata[str(i) + "_"+str(k)] = cresults[1] + outfile = open(svgfolder + "clusterblast" + str(i) + "_" + str(k) + ".svg","w") + outfile.write(s.getXML()) + outfile.close() + #Create svgs for multiple gene cluster alignment + cresults = clusterblastresults(i,hitclusters,queryclusterdata,colorschemedict,rgbcolorscheme) + s = cresults[0] + clusterblastpositiondata[str(i) + "_all"] = cresults[1] + outfile = open(svgfolder + "clusterblast" + str(i) + "_all.svg","w") + outfile.write(s.getXML()) + outfile.close() + + #Create folder for SEARCHGTR HTML files, load search form template + formtemplate = open("search_form.html","r") + formtemplate = formtemplate.read() + formtemplate = formtemplate.replace("\r","\n") + formtemplateparts = formtemplate.split("FASTASEQUENCE") + #Create HTML file with gene cluster info in hidden div tags + htmlfile = open("empty.xhtml","r") + html = htmlfile.read() + html = html.replace("\r","\n") + htmlparts = html.split("") + htmloutfile = open(genomename + "/display.xhtml","w") + htmloutfile.write(htmlparts[0]) + #Add lines toreload all svgs up front + for qclusternr in geneclusters: + htmloutfile.write(' loadsvg(' + str(qclusternr) + ');\n') + if clusterblast == "y": + cblastclusters = [1,2,3,4,5,6,7,8,9,10] + for qclusternr in geneclusters: + nrhitclusters = queryclusterdata[qclusternr][0] + for j in range(nrhitclusters): + htmloutfile.write(' loadcblastsvg(' + str(qclusternr) + ',' + str(j+1) + ');\n') + #For each gene cluster, add hidden div tags for gene names, add hidden div tags for NRPS/PKS domains, add hidden div tags for ClusterBLAST depictions + htmloutfile.write(htmlparts[1]) + for qclusternr in geneclusters: + data = qgeneclusterdata[qclusternr] + pksnrpsprots = data[7] + pksnrpsprotsnames = data[8] + pksnrpsdomains = data[9] + a = 0 + for i in pksnrpsprots: + for j in pksnrpsdomains[i][0]: + htmloutfile.write(' $("#b' + str(qclusternr) + '_00' + str(a) + '_div").hide();\n') + a += 1 + htmloutfile.write(htmlparts[2]) + #Add top menu + gifdict = {"t1pks":"16","t2pks":"17","t3pks":"18","t4pks":"20","nrps":"10","amglyccycl":"1","bcin":"2","blactam":"3","butyrolactone":"4","ectoine":"5","terpene":"19","indole":"7","lant":"8","melanin":"9","nucleoside":"12","other":"13","phosphoglycolipid":"14","siderophore":"15"} + htmloutfile.write('\n') + menubutton_nr = 1 + nrclustercolumns = 1 + for i in geneclusters: + if qgeneclusterdata[i][0] in gifdict.keys(): + typenr = gifdict[qgeneclusterdata[i][0]] + elif "-" in qgeneclusterdata[i][0]: + typenr = "6" + else: + typenr = "13" + htmloutfile.write('\n') + if menubutton_nr == 22 or menubutton_nr == 49: + htmloutfile.write('
') + nrclustercolumns += 1 + menubutton_nr += 1 + + #Add gene cluster description + htmloutfile.write(htmlparts[3]) + extrapixelsdict = {} + for qclusternr in geneclusters: + data = qgeneclusterdata[qclusternr] + clustertype = data[0] + clustersize = data[1] + genes = data[2] + annotations = data[3] + starts = data[4] + ends = data[5] + strands = data[6] + pksnrpsprots = data[7] + pksnrpsprotsnames = data[8] + pksnrpsdomains = data[9] + substrspecnrpspredictordict = data[10] + substrspecminowadict = data[11] + substrspecpkssigdict = data[12] + substrspecconsensusdict = data[13] + gtrs = data[14] + transporters = data[15] + colors = data[16] + hitgeneclusterdata = data[17] + structpred = data[18] + krpredictionsdict = data[19] + relpositions = relativepositions(starts,ends,clustersize) + rel_starts = relpositions[0] + rel_ends = relpositions[1] + #Create genes overview pop-up HTMLs + genepopupoutfile = open(htmlfolder + "geneclustergenes" + str(qclusternr) + '.html',"w") + genepopupoutfile.write('\n\n\n\n\nOverview of gene cluster genes:

\n') + genepopupoutfile.write('\n') + for i in genes: + genepopupoutfile.write('\n') + genepopupoutfile.write('\n
Gene Annotation Start position End position Strand
' + i + ' ' + annotations[i].replace("_"," ") + ' ' + str(starts[genes.index(i)]) + ' ' + str(ends[genes.index(i)]) + ' ' + strands[genes.index(i)] + '

Biosynthetic gene cluster signature gene domains detected:

\n') + genepopupoutfile.write('\n') + for i in genes: + if i in allcoregenes: + detected_doms = detecteddomainsdict[i] + for j in detected_doms: + genepopupoutfile.write('\n') + genepopupoutfile.write('\n
Gene Detected domains Bit scores
' + i + ' ' + str(j[0]) + ' ' + str(j[1]) + '

') + genepopupoutfile.write('\n\n\n') + genepopupoutfile.close() + #Add gene cluster description on top + if qclusternr == 1: + htmloutfile.write('

') + else: + htmloutfile.write('\n\n
') + #Add menu bars 1 & 2 + htmloutfile.write('
Gene cluster description
') + htmloutfile.write('
PKS/NRPS domain annotation
') + htmloutfile.write('
\n') + htmloutfile.write('
\n') + htmloutfile.write('
\n') + htmloutfile.write('
\n') + if screenwidth < 1280: + htmloutfile.write('
\n') + else: + htmloutfile.write('
\n') + htmloutfile.write("
Gene Cluster " + str(qclusternr) + ". Type = " + clustertype + ". Location: "+ str(starts[0]) + " - " + str(ends[-1]) + " nt. Click on genes for more information.") + if len(genomic_accnr) > 4: + htmloutfile.write('  GBK') + #Genes overview pop-up. + if len(clustertype) > 20: + htmloutfile.write('
') + htmloutfile.write('    Genes and detection info overview') + htmloutfile.write("
\n\n") + htmloutfile.write('
\n') + if nrclustercolumns > 1: + spacers = nrclustercolumns - 1 + for i in range(spacers): + htmloutfile.write('\n') + htmloutfile.write('
\n') + #Add gene pop-ups + a = 0 + for i in genes: + htmloutfile.write('
\n') + htmloutfile.write(annotations[i].replace("_"," ").replace("&","&") + "\n") + if smcogs == "y": + if smcogdict.has_key(i): + smcog = smcogdict[i] + htmloutfile.write("
smCOG: " + smcog + " (" + smcogdescriptions[smcog].replace("_"," ").replace("&","&") + ")\n") + if smcog in gtrcoglist: + formfileloc = searchgtrfolder + i + ".html" + formfile = open(formfileloc,"w") + specificformtemplate = formtemplateparts[0].replace("GlycTr",i) + formfile.write(specificformtemplate) + formfile.write(i + "\n" + seqdict[i]) + formfile.write(formtemplateparts[1]) + formfile.close() + htmloutfile.write("
Run SEARCHGTr on this gene \n") + if smcog in transportercoglist: + link = "http://blast.jcvi.org/er-blast/index.cgi?project=transporter;program=blastp;sequence=sequence%0A" + seqdict[i] + htmloutfile.write("
TransportDB BLAST on this gene \n") + else: + htmloutfile.write("
smCOG: -\n") + link = "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&PROGRAM=blastp&BLAST_PROGRAMS=blastp&QUERY=" + seqdict[i] + "&LINK_LOC=protein&PAGE_TYPE=BlastSearch" + htmloutfile.write("
Location: " + str(starts[a]) + "-" + str(ends[a]) + "\n") + htmloutfile.write("
NCBI BlastP on this gene
\n") + browse_start = starts[a] - 10000 + browse_end = ends[a] + 10000 + if browse_start < 0: + browse_start = 0 + if browse_end > dnaseqlength: + browse_end = dnaseqlength + if genomic_accnr != "none" and genomic_accnr != "": + htmloutfile.write('View genomic context
\n') + if smcogs == "y": + if smcogtreedict.has_key(i.rpartition(".")[0]): + htmloutfile.write('View smCOG seed phylogenetic tree with this gene\n') + elif smcogtreedict.has_key(i): + htmloutfile.write('View smCOG seed phylogenetic tree with this gene\n') + htmloutfile.write("
\n\n") + htmloutfile.write('
\n') + htmloutfile.write(i) + htmloutfile.write("
\n\n") + a += 1 + #Early calculation of nr of domains to be able to fit structure prediction information of large NRPSs/PKSs + pksnrpsdomainnr = 0 + krdomainnr = 0 + adomainnr = 0 + for i in pksnrpsprots: + doms = pksnrpsdomains[i][0] + first = "no" + nra = 0 + nrat = 0 + nrkr = 0 + nrcal = 0 + for j in doms: + if "AMP-binding" in j or "A-OX" in j: + j = "A" + nra += 1 + adomainnr += 1 + z = nra + if "KR" in j: + j = "KR" + nrkr += 1 + krdomainnr += 1 + z = nrkr + if "AT" in j and "docking" not in j: + j = "AT" + nrat += 1 + pksnrpsdomainnr += 1 + z = nrat + if "CAL" in j: + j = "CAL" + nrcal += 1 + pksnrpsdomainnr += 1 + z = nrcal + pixels = adomainnr * 50 + pksnrpsdomainnr * 40 + krdomainnr * 30 + (len(pksnrpsprots) * 16) + 375 + extrapixels = pixels - (676 + len(pksnrpsprots) * 99) + if extrapixels < 0: + extrapixels = 0 + extrapixelsdict[qclusternr] = extrapixels + #Add picture of predicted chemical structure + htmloutfile.write('
\n') + htmloutfile.write('
\n') + htmloutfile.write('
\n') + htmloutfile.write('
\n') + htmloutfile.write('
\n') + if screenwidth < 1280: + htmloutfile.write('
Predicted core structure
\n') + else: + htmloutfile.write('
Predicted core structure
\n') + htmloutfile.write('
\n') + htmloutfile.write('
\n') + htmloutfile.write('
\n') + if qclusternr in failedstructures: + htmloutfile.write('

\n') + elif " " in structpred: + htmloutfile.write('

\n') + else: + htmloutfile.write('

\n') + htmloutfile.write('
\n') + htmloutfile.write("Monomers prediction: " + structpred + "
\n") + if qclusternr in dockingdomainanalysis: + htmloutfile.write('Docking domain analysis results.
\n') + nrpsfound = "no" + pksnrpsdomainnr = 0 + adomainnr = 0 + krdomainnr = 0 + for i in pksnrpsprots: + doms = pksnrpsdomains[i][0] + first = "no" + nra = 0 + nrat = 0 + nrkr = 0 + nrcal = 0 + for j in doms: + if "AMP-binding" in j or "A-OX" in j: + j = "A" + nra += 1 + adomainnr += 1 + z = nra + if "KR" in j: + j = "KR" + nrkr += 1 + krdomainnr += 1 + z = nrkr + if "AT" in j and "docking" not in j: + j = "AT" + nrat += 1 + pksnrpsdomainnr += 1 + z = nrat + if "CAL" in j: + j = "CAL" + nrcal += 1 + pksnrpsdomainnr += 1 + z = nrcal + prediction = "no" + domname = str(i) + "_" + str(j) + str(z) + if domname in substrspecnrpspredictordict.keys(): + nrpsfound = "yes" + prediction = "yes" + if substrspecnrpspredictordict[domname][0] == "nrp": + if first == "no": + first = "yes" + htmloutfile.write(i + ':
') + htmloutfile.write('  NRPSPredictor code prediction, '+ str(j) + str(z) + ': ?
\n') + else: + if first == "no": + first = "yes" + htmloutfile.write(i + ':
') + htmloutfile.write('  NRPSPredictor code prediction, '+ str(j) + str(z) + ': ' + substrspecnrpspredictordict[domname][0] + '
\n') + if substrspecnrpspredictordict[domname][1] == "nrp": + if first == "no": + first = "yes" + htmloutfile.write(i + ':
') + htmloutfile.write('  NRPSPredictor SVM prediction, '+ str(j) + str(z) + ': ?
\n') + else: + if first == "no": + first = "yes" + htmloutfile.write(i + ':
') + htmloutfile.write('  NRPSPredictor SVM prediction, '+ str(j) + str(z) + ': ' + substrspecnrpspredictordict[domname][1] + '
\n') + if domname in substrspecminowadict.keys(): + prediction = "yes" + if substrspecminowadict[domname] == "nrp" or substrspecminowadict[domname] == "pk": + if first == "no": + first = "yes" + htmloutfile.write(i + ':
') + htmloutfile.write('  Minowa prediction, '+ str(j) + str(z) + ': ?
\n') + else: + if first == "no": + first = "yes" + htmloutfile.write(i + ':
') + htmloutfile.write('  Minowa prediction, '+ str(j) + str(z) + ': ' + substrspecminowadict[domname] + '
\n') + if domname in substrspecpkssigdict.keys(): + prediction = "yes" + if substrspecpkssigdict[domname] == "pk": + if first == "no": + first = "yes" + htmloutfile.write(i + ':
') + htmloutfile.write('  PKS code prediction, '+ str(j) + str(z) + ': ?
\n') + else: + if first == "no": + first = "yes" + htmloutfile.write(i + ':
') + htmloutfile.write('  PKS code prediction, '+ str(j) + str(z) + ': ' + substrspecpkssigdict[domname] + '
\n') + if domname in krpredictionsdict.keys(): + if first == "no": + first = "yes" + htmloutfile.write(i + ':
') + htmloutfile.write('  KR activity, '+ str(j) + str(z) + ': ' + krpredictionsdict[domname][0] + "
\n") + htmloutfile.write('  KR stereochemistry, '+ str(j) + str(z) + ': ' + krpredictionsdict[domname][1] + "
\n") + #Add link to prediction details pop-up + if prediction == "yes": + htmloutfile.write('    Prediction details
\n') + if nrpsfound == "yes": + htmloutfile.write('
Perform Norine peptide search') + htmloutfile.write('
') + if screenwidth < 1280: + htmloutfile.write('
File outputs
\n') + else: + htmloutfile.write('
Downloadable output files
\n') + htmloutfile.write('
\n') + htmloutfile.write('
\n') + htmloutfile.write('
') + if fullhmm == "y" or fullblast == "y": + htmloutfile.write('Open EMBL summary file

') + #htmloutfile.write('Download CSV summary file

') + if fullhmm == "y": + htmloutfile.write('Sec. met. enriched genome regions

') + htmloutfile.write('Open XLS overview table

') + htmloutfile.write('
') + htmloutfile.write("
\n\n") + #Add descriptions of NRPS/PKS genes + htmloutfile.write('
\n') + htmloutfile.write("
\n\n") + z = 1 + for i in pksnrpsprots: + htmloutfile.write('
\n') + htmloutfile.write(i + " (" + pksnrpsprotsnames[i].lower() + ")") + htmloutfile.write("
\n\n") + z += 1 + #Add NRPS/PKS domain pop-ups + longestprot = 0 + protlengthdict = {} + for i in pksnrpsprots: + protlength = (geneposdict[i][1] - geneposdict[i][0]) / 3 + protlengthdict[i] = protlength + if protlength > longestprot: + longestprot = protlength + try: + aa2pixelratio = longestprot * 0.75 / screenwidth + except: + aa2pixelratio = 0.1 + a = 0 + z = 1 + for i in pksnrpsprots: + domainsdict = pksnrpsdomains[i][1] + nra = 0 + nrat = 0 + nrkr = 0 + nrcal = 0 + for j in pksnrpsdomains[i][0]: + startpos = domainsdict[j][0] + endpos = domainsdict[j][1] + htmloutfile.write('
\n') + htmloutfile.write("Domain " + j + " (" + i + ")") + link = "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&PROGRAM=blastp&BLAST_PROGRAMS=blastp&QUERY=" + seqdict[i][startpos:endpos] + "&LINK_LOC=protein&PAGE_TYPE=BlastSearch" + htmloutfile.write("
Location: " + str(startpos) + "-" + str(endpos) + " AA\n") + domid = i + "_" + j + if "AMP-binding" in j or "A-OX" in j: + j = "A" + nra += 1 + y = nra + if "PKS_KR" in j: + j = "KR" + nrkr += 1 + y = nrkr + if "PKS_AT" in j: + j = "AT" + nrat += 1 + y = nrat + if "CAL_domain" in j: + j = "CAL" + nrcal += 1 + y = nrcal + prediction = "no" + domid = str(i) + "_" + str(j) + str(y) + if substrspecnrpspredictordict.has_key(domid) or substrspecminowadict.has_key(domid) or substrspecpkssigdict.has_key(domid): + htmloutfile.write("
Predicted substrate: " + substrspecconsensusdict[domid] + "\n") + if substrspecnrpspredictordict.has_key(domid): + htmloutfile.write("
-NRPSPredictor code: " + substrspecnrpspredictordict[domid][0] + "\n") + htmloutfile.write("
-NRPSPredictor SVM: " + substrspecnrpspredictordict[domid][1] + "\n") + if substrspecminowadict.has_key(domid): + htmloutfile.write("
-Minowa HMM: " + substrspecminowadict[domid] + "\n") + if substrspecpkssigdict.has_key(domid): + htmloutfile.write("
-PKS code: " + substrspecpkssigdict[domid] + "\n") + if krpredictionsdict.has_key(domid): + htmloutfile.write("
KR activity: " + krpredictionsdict[domid][0] + "\n") + htmloutfile.write("
KR stereochemistry: " + krpredictionsdict[domid][1] + "\n") + htmloutfile.write("
NCBI BlastP on this domain \n") + htmloutfile.write("
\n\n") + a += 1 + z += 1 + htmloutfile.write('
\n') + + if clusterblast == "y": + #Write ClusterBlast divs with pictures and description pop-up tags + htmloutfile.write('
\n\n') + #Add menu bar 3 + htmloutfile.write('
Homologous gene clusters
') + htmloutfile.write('
') + htmloutfile.write('
') + for qclusternr in geneclusters: + nrhitclusters = queryclusterdata[qclusternr][0] + hitclusterdata = queryclusterdata[qclusternr][1] + if qclusternr == 1: + htmloutfile.write('
\n

\n
\n
\n\n') + htmloutfile.write('\n') + for i in range(nrhitclusters): + htmloutfile.write('\n') + htmloutfile.write('\n
\n\n
') + htmloutfile.write('
') + clustersizes = [] + for i in range(nrhitclusters): + hitclusterdata = queryclusterdata[qclusternr][1] + queryclustergenes = hitclusterdata[1][3] + queryclustergenesdetails = hitclusterdata[1][4] + hitclusternumber = i + 1 + cluster_acc = hitclusterdata[hitclusternumber][6] + hitclustergenes = hitclusterdata[hitclusternumber][1] + hitclustergenesdetails = hitclusterdata[hitclusternumber][2] + relpositiondata = clusterblastpositiondata[str(qclusternr) + "_" + str(i+1)] + qrel_starts = relpositiondata[0][0] + qrel_ends = relpositiondata[0][1] + hrel_starts = relpositiondata[1][hitclusternumber ][0] + hrel_ends = relpositiondata[1][hitclusternumber ][1] + strandsbalance = relpositiondata[2][hitclusternumber] + if strandsbalance < 0: + hitclustergenes.reverse() + if qclusternr == 1 and (i+1) == 1: + htmloutfile.write('
\n') + else: + htmloutfile.write('
\n') + #Insert gene cluster descriptions + cdescription = hitclusterdata[i+1][5][i].replace("&","&").replace("\t"," ").partition(" ")[2].partition(" ")[2].split(", whole")[0].split(", complete")[0] + if len(nucname) < 80: + qdescription = nucname + else: + qdescription = nucname[0:77] + "..." + htmloutfile.write('
' + qdescription + '
\n') + htmloutfile.write('
' + cdescription + '
\n') + #Insert pubmed/pubchem links + htmloutfile.write('
Hit cluster cross-links: \n') + htmloutfile.write('  \n') + present = "n" + for j in pubmed_dict.keys(): + if j in cluster_acc: + present = "y" + for j in pubchem_dict.keys(): + if j in cluster_acc: + present = "y" + if present == "y": + for j in pubmed_dict.keys(): + if j in cluster_acc: + pubmedstring = pubmed_dict[j] + htmloutfile.write('  \n') + for j in pubchem_dict.keys(): + if j in cluster_acc: + pubchemstring = pubchem_dict[j] + if "," in pubchemstring: + htmloutfile.write('  \n') + else: + htmloutfile.write('  \n') + htmloutfile.write('
\n\n') + #Create gene pop-ups + a = 0 + for j in queryclustergenes: + j_accession = accessiondict[j] + htmloutfile.write('
\n') + htmloutfile.write(queryclustergenesdetails[j][3].replace("_"," ").replace("&","&") + "\n") + link = "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&PROGRAM=blastp&BLAST_PROGRAMS=blastp&QUERY=" + j_accession + "&LINK_LOC=protein&PAGE_TYPE=BlastSearch" + htmloutfile.write("
Location: " + str(queryclustergenesdetails[j][0]) + "-" + str(queryclustergenesdetails[j][1]) + "\n") + htmloutfile.write("
NCBI BlastP on this gene \n") + htmloutfile.write("
\n\n") + htmloutfile.write('
\n') + htmloutfile.write(j) + htmloutfile.write("
\n\n") + a+= 1 + a = 0 + for j in hitclustergenes: + j_accession = hitclustergenesdetails[j][4] + htmloutfile.write('
\n') + htmloutfile.write(hitclustergenesdetails[j][3].replace("_"," ").replace("&","&") + "\n") + link = "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&PROGRAM=blastp&BLAST_PROGRAMS=blastp&QUERY=" + j_accession + "&LINK_LOC=protein&PAGE_TYPE=BlastSearch" + htmloutfile.write("
Location: " + str(hitclustergenesdetails[j][0]) + "-" + str(hitclustergenesdetails[j][1]) + "\n") + htmloutfile.write("
NCBI BlastP on this gene \n") + htmloutfile.write("
\n\n") + htmloutfile.write('
\n') + htmloutfile.write(j) + htmloutfile.write("
\n\n") + a += 1 + htmloutfile.write('
\n') + #Find new relative positions for display of all gene clusters in one picture + relpositiondata = clusterblastpositiondata[str(qclusternr) + "_all"] + qrel_starts = relpositiondata[0][0] + qrel_ends = relpositiondata[0][1] + htmloutfile.write('
\n') + if len(nucname) < 80: + qdescription = nucname + else: + qdescription = nucname[0:77] + "..." + htmloutfile.write('
' + qdescription + '
\n') + for i in range(nrhitclusters): + hitclusterdata = queryclusterdata[qclusternr][1] + queryclustergenes = hitclusterdata[1][3] + queryclustergenesdetails = hitclusterdata[1][4] + hitclusternumber = i + 1 + hrel_starts = relpositiondata[1][hitclusternumber][0] + hrel_ends = relpositiondata[1][hitclusternumber][1] + cluster_acc = hitclusterdata[hitclusternumber][6] + hitclustergenes = hitclusterdata[hitclusternumber][1] + hitclustergenesdetails = hitclusterdata[hitclusternumber][2] + strandsbalance = relpositiondata[2][hitclusternumber] + cdescription = hitclusterdata[i+1][5][i].replace("&","&").replace("\t"," ").partition(" ")[2].partition(" ")[2].split(", whole")[0].split(", complete")[0] + htmloutfile.write('
' + cdescription + '
\n') + if hitclusternumber == 1: + a = 0 + for j in queryclustergenes: + htmloutfile.write('
\n') + htmloutfile.write(queryclustergenesdetails[j][3].replace("_"," ").replace("&","&") + "\n") + link = "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&PROGRAM=blastp&BLAST_PROGRAMS=blastp&QUERY=" + j + "&LINK_LOC=protein&PAGE_TYPE=BlastSearch" + htmloutfile.write("
Location: " + str(queryclustergenesdetails[j][0]) + "-" + str(queryclustergenesdetails[j][1]) + "\n") + htmloutfile.write("
NCBI BlastP on this gene \n") + htmloutfile.write("
\n\n") + htmloutfile.write('
\n') + htmloutfile.write(j) + htmloutfile.write("
\n\n") + a+= 1 + a = 0 + for j in hitclustergenes: + htmloutfile.write('
\n') + htmloutfile.write(hitclustergenesdetails[j][3].replace("_"," ").replace("&","&") + "\n") + link = "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&PROGRAM=blastp&BLAST_PROGRAMS=blastp&QUERY=" + j + "&LINK_LOC=protein&PAGE_TYPE=BlastSearch" + htmloutfile.write("
Location: " + str(hitclustergenesdetails[j][0]) + "-" + str(hitclustergenesdetails[j][1]) + "\n") + htmloutfile.write("
NCBI BlastP on this gene \n") + htmloutfile.write("
\n\n") + htmloutfile.write('
\n') + htmloutfile.write(j) + htmloutfile.write("
\n\n") + a += 1 + htmloutfile.write('
\n') + htmloutfile.write('
\n\n') + if clusterblast == "y": + htmloutfile.write('
\n') + for i in geneclusters: + data = qgeneclusterdata[i] + extrapixels = extrapixelsdict[i] + pksnrpsprots = data[7] + if i == 1: + htmloutfile.write('
') + else: + htmloutfile.write('
') + htmloutfile.write('
\n
\n    \n    \n    \n    \n
\n
\nantiSMASH: Rapid identification, annotation and analysis of secondary metabolite biosynthesis gene clusters.\n
Marnix H. Medema, Kai Blin, Peter Cimermancic, Victor de Jager, Piotr Zakrzewski, Michael A. Fischbach, Tilmann Weber, Rainer Breitling & Eriko Takano\n
Nucleic Acids Research (2011), proposal submitted.\n
\n
\n
') + #Add final part of HTML file + htmloutfile.write(htmlparts[-1]) + #Copy accessory files for HTML viewing + if sys.platform == ('win32'): + copycommand1 = "copy/y vis\\* " + genomename + " > nul" + copycommand2 = "copy/y vis\\html\\* " + genomename + "\\html > nul" + copycommand3 = "copy/y vis\\images\\* " + genomename + "\\images > nul" + elif sys.platform == ('linux2'): + copycommand1 = "cp -r vis/* " + genomename + " > /dev/null" + copycommand2 = "true" + copycommand3 = "true" + os.system(copycommand1) + os.system(copycommand2) + os.system(copycommand3) + + #Generate EMBL output + emblfile = open(genomename + "/embl_lines.txt","w") + for i in geneclustergenes: + emblfile.write(i + "\t") + if smcogs == "y": + if smcogdict.has_key(i): + emblfile.write("smCOG: " + smcogdict[i] + ":" + smcogdescriptions[smcogdict[i]] + "\t") + if nrpspkstypedict.has_key(i): + emblfile.write("NRPS/PKS type: " + nrpspkstypedict[i] + "\t") + if domaindict.has_key(i): + domains = domaindict[i] + for j in domains: + emblfile.write(j[0] + " (" + str(j[1]) + "-" + str(j[2]) + "); E-value:" + str(j[3]) + "; Bit score: " + str(j[4]) + "\t") + nrat = 0 + for k in minowa_pks_preds.keys(): + if i in k: + nrat += 1 + emblfile.write("AT-domain " + str(nrat) + " Minowa substrate specificity prediction: " + minowa_pks_preds[k] + "\t") + nrat = 0 + for k in pks_code_preds.keys(): + if i in k: + nrat += 1 + emblfile.write("AT-domain " + str(nrat) + " PKS code substrate specificity prediction: " + pks_code_preds[k] + "\t") + nrcal = 0 + for k in minowa_cal_preds.keys(): + if i in k: + nrcal += 1 + emblfile.write("CAL-domain " + str(nrcal) + " Minowa substrate specificity prediction: " + minowa_cal_preds[k] + "\t") + nra = 0 + for k in minowa_nrps_preds.keys(): + if i in k: + nra += 1 + emblfile.write("A-domain " + str(nra) + " Minowa substrate specificity prediction: " + minowa_nrps_preds[k] + "\t") + nra = 0 + for k in nrps_code_preds.keys(): + if i in k: + nra += 1 + emblfile.write("A-domain " + str(nra) + " Stachelhaus code substrate specificity prediction: " + nrps_code_preds[k] + "\t") + nra = 0 + for k in nrps_svm_preds.keys(): + if i in k: + nra += 1 + emblfile.write("A-domain " + str(nra) + " NRPSPredictor2 SVM substrate specificity prediction: " + nrps_svm_preds[k] + "\t") + nrkr = 0 + for k in kr_activity_preds.keys(): + if i in k: + nrkr += 1 + emblfile.write("KR-domain " + str(nrat) + " activity prediction: " + kr_activity_preds[k] + "\t") + emblfile.write("KR-domain " + str(nrat) + " predicted stereochemistry group: " + kr_stereo_preds[k] + "\t") + if motifdict.has_key(i): + l = motifdict[i] + for m in l: + emblfile.write("Motif " + str(m[0]) + " (" + str(m[1]) + "-" + str(m[2]) + "). E-value: " + str(m[3]) + "; Bit score: " + str(m[4]) + "\t") + emblfile.write("\n") + emblfile.write("\n\n>>\n\n") + #enter separate domain entries + for i in geneclustergenes: + strand = strandsdict[i] + startpos = geneposdict[i][0] + endpos = geneposdict[i][1] + if domaindict.has_key(i): + domains = domaindict[i] + for j in domains: + if strand == "+": + emblfile.write("misc_feature\t" + str(startpos + j[1] * 3) + ".." + str(startpos + j[2] * 3) + "\t" + str(j[0]) + " domain;\tE-value: " + str(j[3]) + "\tBit score: " + str(j[4]) + "\t/colour=2\n") + elif strand == "-": + emblfile.write("misc_feature\tcomplement(" + str(endpos - j[2] * 3) + ".." + str(endpos - j[1] * 3) + ")\t" + str(j[0]) + "domain;\tE-value: " + str(j[3]) + "Bit score: " + str(j[4]) + "\t/colour=2\n") + if motifdict.has_key(i): + l = motifdict[i] + for m in l: + if strand == "+": + emblfile.write("misc_feature\t" + str(startpos + m[1] * 3) + ".." + str(startpos + m[2] * 3) + "\t" + str(m[0]) + " motif;\tE-value: " + str(m[3]) + "\tBit score: " + str(m[4]) + "\t/colour=6\n") + elif strand == "-": + emblfile.write("misc_feature\tcomplement(" + str(endpos - m[2] * 3) + ".." + str(endpos - m[1] * 3) + ")\t" + str(m[0]) + " motif;\tE-value: " + str(m[3]) + "\tBit score: " + str(m[4]) + "\t/colour=6\n") + emblfile.write("\n\n>>\n\n") + for i in geneclusters: + cstart = clusterinfo[i][1] + if cstart == 0: + cstart = 1 + cend = clusterinfo[i][2] + emblfile.write("misc_feature\t" + str(cstart) + ".." + str(cend) + "\t" + clusterinfo[i][0] + " gene cluster\t/colour=13\n") + emblfile.close() + + #Close open html file + htmloutfile.close() + + #Run whole-genome BLAST / HMM CLUSEAN modules & ClusterFinder + if sys.platform == ('win32'): + copycommand = "copy " + infile + " " + genomename + ' > nul' + if sys.platform == ('linux2'): + copycommand = "cp " + infile + " " + genomename + os.system(copycommand) + os.chdir(genomename) + args = "--cpus %s " % nrcpus + if fullblast == "n": + args += "--without-blast " + if fullhmm == "n": + args += "--without-hmmer " + if fullhmm == "y": + args += '--pfamdbpath %s ' % pfamdbpath + if fullblast == "y": + args += '--blastdbpath %s ' % blastdbpath + logfile.write("Running CLUSEAN pipeline modules.\n") + if sys.platform == ('win32'): + os.system("python ..\\clusean\\scripts\\runPipeline.py %s" % args) + if sys.platform == ('linux2'): + os.system( antismash_path + "clusean/scripts/runPipeline.py %s" % args) + #print antismash_path + "clusean/scripts/runPipeline.py %s" % args + + os.chdir('..') + + #Close log file + logfile.write("antiSMASH successfully finished in " + str(elapsed) + " seconds.\n") + #print "antiSMASH successfully finished in " + str(elapsed) + " seconds.\n" + logfile.close() diff -r 000000000000 -r 6a37d0a4510a multi_antiSMASH.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/multi_antiSMASH.xml Thu Mar 15 05:23:03 2012 -0400 @@ -0,0 +1,119 @@ + + and Antibiotics Analysis (multi antiSMASH) + multi_antiSMASH_wrapper.py +## The command is a Cheetah template which allows some Python based syntax. +## Lines starting hash hash are comments. Galaxy will turn newlines into spaces +## ./antismash.py Tue6071_genome.fasta --geneclustertypes 1 --fullblast y --fullhmm y + + #if $input_opts.input_opts_selector == "fasta": + --input $infile + --glimmer_prediction $glimmer_prediction + #else: + --input $infile + #end if + + --input_format $input_opts.input_opts_selector + + $eukaryotic + $genecluster + $smCOG + $wg_blast + --geneclustertypes $geneclustertypes + --geneclusterprots $geneclusterprots + --geneclusterprots_gff $geneclusterprots_gff + --embl_path $embl + > /dev/null 2> /dev/null + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +That version of antiSMASH can only handle multiple sequence files. +It will run antiSMASH automatically on each sequence and aggregate the most important information from all runs. +It will not provide an interactive HTML output file. If you need that please use the normal antiSMASH version on a single sequence. + + +.. class:: warningmark + +**Warning on FASTA Header** + +The FASTA header should not contain any special letters, like '|'. Please replace them with the 'sed-tool'. + + +**What it does** + +antiSMASH allows the rapid genome-wide identification, annotation and analysis of secondary metabolite biosynthesis gene clusters in bacterial and fungal genomes. +It integrates and cross-links with a large number of in silico secondary metabolite analysis tools that have been published earlier. + + +**Input** + +If you don't have an annotated GenBank or embl file you also can provide a glimmer prediction output. Created with glimmer or glimmerHMM. + + +**References** + +Marnix H. Medema, Kai Blin, Peter Cimermancic, Victor de Jager, Piotr Zakrzewski, Michael A. Fischbach, Tilmann Weber, +Rainer Breitling and Eriko Takano (2011). antiSMASH: Rapid identification, annotation and analysis of secondary metabolite biosynthesis gene clusters. Nucleic Acids Research, doi: 10.1093/nar/gkr466. + +http://antismash.secondarymetabolites.org/help.html + + + diff -r 000000000000 -r 6a37d0a4510a multi_antiSMASH_wrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/multi_antiSMASH_wrapper.py Thu Mar 15 05:23:03 2012 -0400 @@ -0,0 +1,265 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +import os, sys, subprocess, commands +import random, shutil +import zipfile +from Bio import SeqIO +import tempfile + +blastdbpath = '/home/galaxy/bin/antismash-1.1.0/db' +pfamdbpath = '/home/galaxy/bin/antismash-1.1.0/db' +antismash_path = '/home/galaxy/bin/antismash-1.1.0/antismash.py' + + +def anitSMASH(args, sequence_path, glimmer_path): + #./antismash.py Tue6071_genome.fasta --geneclustertypes 1 --fullhmm y + print sequence_path + rint = random.randint(1,10000000) + tmp_dir = '/tmp/galaxy_%s' % rint + os.mkdir(tmp_dir) + os.mkdir(os.path.join( tmp_dir, 'geneprediction' )) + os.chdir(tmp_dir) + print 'IFORMAT:',args.iformat + if args.iformat == 'fasta': + new_input_path = os.path.join(tmp_dir, os.path.basename(sequence_path) + '.fasta') + elif args.iformat == 'genbank': + new_input_path = os.path.join(tmp_dir, os.path.basename(sequence_path) + '.gbk') + else: + new_input_path = os.path.join(tmp_dir, os.path.basename(sequence_path) + '.embl') + + + # try to generate the same name as in antismash.py + genomename = ".".join( (os.path.basename(sequence_path) + '.'+ args.iformat).split(".")[:-1] ) + for i in """!"#$%&()*+,./:;=>?@[]^`{|}'""": + genomename = genomename.replace(i,"") + result_path = os.path.join( tmp_dir, genomename ) + + shutil.copy(sequence_path, new_input_path ) + + if args.eukaryotic: + taxon = '--taxon e' + else: + taxon = '--taxon p' + + if args.clusterblast: + clusterblast = '--clusterblast y' + else: + clusterblast = '--clusterblast n' + + if args.smcogs: + smcogs = '--smcogs y' + else: + smcogs = '--smcogs n' + + if args.fullhmm: + fullhmm = '--fullhmm y' + else: + fullhmm = '--fullhmm n' + + if args.fullblast: + fullblast = '--fullblast y' + else: + fullblast = '--fullblast n' + + h = [antismash_path, new_input_path, + '--geneclustertypes %s' % args.geneclustertypes, + taxon, + clusterblast, + smcogs, + fullhmm, + fullblast, + '--blastdbpath %s' % blastdbpath, + '--pfamdbpath %s' % pfamdbpath, + '--cores 10', + ] + if args.iformat == 'fasta': + h.append('--glimmer_prediction %s' % glimmer_path) + + a = ' '.join(h) + + subprocess.call(a, shell=True) + print a + embl_path = os.path.join(result_path, '%s.final.embl' % genomename) + if os.path.exists(embl_path) and args.embl_path: + args.embl_path.write( open(embl_path).read() ) + + gff_line = "%s\tantismash\t%s\t%s\t%s\t.\t%s\t.\t%s" + + clustername_mapping = {} + genecluster_path = os.path.join(result_path, 'clusterblast/geneclusters.txt') + if os.path.exists(genecluster_path): + for line in open( genecluster_path): + token = line.split('\t') + clustername_mapping[token[2]] = token[3] + old_cluster_name = False + parent_cluster_name = "" + child_lines = "" + starts = [] + ends = [] + for line in open( os.path.join(result_path, 'clusterblast/geneclusterprots.fasta') ): + if line.startswith('>'): + columns = line[1:].strip().split('|') + args.geneclusterprots.write( line.replace('|%s|' % columns[1], '|%s|%s|' % (columns[1],clustername_mapping[columns[1]])) ) + + """ + "Handlampe" kann man zu einer Person sagen wenn es nicht mehr zum Armleuchter reicht. Es ist also ein abgeschwächte Form für "Depp". + """ + if old_cluster_name == False: + # inital clustername setting + # try to catch the line in which the name changes + old_cluster_name = columns[1] + parent_cluster_name = columns[1] + + seqname = columns[0] + strand = columns[3] + (start, end) = columns[2].split('-') + starts.append(int(start)) + ends.append(int(end)) + + if old_cluster_name != columns[1]: + # begin of a new cluster, caclulate the parent line for the previous clusterproteins + # TODO: maybe the assumption is not correct, to take the smallest start and the largest end + starts.sort() + ends.sort() + genecluster_start = starts[0] + genecluster_end = ends[-1] + group = 'ID=%s;Name=%s;Clustertype=%s;Note=%s\n' % (parent_cluster_name, columns[4], clustername_mapping[columns[1]], columns[5]) + oline = gff_line % ( + seqname, 'genecluster', genecluster_start, genecluster_end, strand, group + ) + args.geneclusterprots_gff.write(oline + child_lines) + starts = [] + ends = [] + old_cluster_name = columns[1] + parent_cluster_name = columns[1] + child_lines = '' + + group = 'ID=%s;Parent=%s;Name=%s;Clustertype=%s;Note=%s\n' % (columns[4], columns[1], columns[4], clustername_mapping[columns[1]], columns[5]) + child_lines += gff_line % ( + seqname, 'gene', start, end, strand, group + ) + + else: + args.geneclusterprots.write( line ) + + starts.sort() + ends.sort() + genecluster_start = starts[0] + genecluster_end = ends[-1] + group = 'ID=%s;Name=%s;Clustertype=%s;Note=%s\n' % (parent_cluster_name, columns[4], clustername_mapping[columns[1]], columns[5]) + oline = gff_line % ( + seqname, 'genecluster', genecluster_start, genecluster_end, strand, group + ) + args.geneclusterprots_gff.write(oline + child_lines) + + # remove tmp directory + shutil.rmtree(tmp_dir) + + +def arg_parse(): + import argparse + parser = argparse.ArgumentParser(prog = 'antiSMASH-Wrapper') + parser.add_argument('--version', action='version', version='%(prog)s 0.01') + parser.add_argument('--geneclustertypes') + parser.add_argument('--clusterblast', action='store_true') + parser.add_argument('--eukaryotic', action='store_true') + parser.add_argument('--fullhmm', action='store_true') + parser.add_argument('--smcogs', action='store_true') + parser.add_argument('--fullblast', action='store_true') + + parser.add_argument('--input', '-i', help='FASTA Sequence File') + parser.add_argument('--glimmer_prediction', help='Glimmer Prediction File') + + parser.add_argument('--input_format', '-f', dest="iformat", help='input format, if the input is a fasta file you need a glimmer file as additional input') + + parser.add_argument('--zip', help='output: all files as zip file') + parser.add_argument('--html_file', help='output: the path to the index html file') + parser.add_argument('--html_path', help='output: the path to the output html dir') + parser.add_argument('--embl_path', help='output: the path to the embl output file', type=argparse.FileType('w')) + parser.add_argument('--geneclusterprots', help='output: Genecluster Fasta File', type=argparse.FileType('w')) + parser.add_argument('--geneclusterprots_gff', help='output: Genecluster GFF', type=argparse.FileType('w')) + + + args = parser.parse_args() + return args + + +def extract_glimmerHMM_results(sequence_id, glimmerHMM): + """ + Returns the given annoations from a sequence_id as complete GFF3 formated string. + """ + result = "##gff-version 3\n" + found_annoations = False + for line in open(glimmerHMM): + if line.startswith(sequence_id + '\t'): + result += line + found_annoations = True + if found_annoations: + return result + else: + return False + +def extract_glimmer_results(sequence_id, glimmer): + """ + Returns the given annoations from a sequence_id as glimmer result file. + """ + result = '' + found_annoations = False + for line in open(glimmer): + # that construct matches the pseudo fasta header and the corresponding orfs + if line.split()[0].startswith(sequence_id): + result += line + found_annoations = True + if found_annoations: + return result + else: + return False + + +if __name__ == '__main__': + args = arg_parse() + geneclusterprots = "" + temp_dir = tempfile.mkdtemp() + args.geneclusterprots_gff.write("##gff-version 3\n") + + + + for record in SeqIO.parse(open(args.input, "rU"), args.iformat) : + # create two tem files and fill it with the information from one sequence + record.id = record.id or record.name + tmp_sequence = os.path.join(temp_dir, record.id) #fasta -file + SeqIO.write(record, open(tmp_sequence, 'w+'), args.iformat) + + tmp_glimmer = '' + if args.iformat == 'fasta': + # We need to annotate the sequences manually + if args.eukaryotic: + #print os.path.join(temp_dir, record.id + '.gff') + tmp_glimmer = os.path.join(temp_dir, record.id + '.gff') + annotations = extract_glimmerHMM_results(record.id, args.glimmer_prediction) + #print annotations, ':', record.id, args.glimmer_prediction + if not annotations: + continue + open(tmp_glimmer, 'w+').write( annotations ) + else: + tmp_glimmer = os.path.join(temp_dir, record.id + '.tsv') + annotations = extract_glimmer_results(record.id, args.glimmer_prediction) + if not annotations: + continue + open(tmp_glimmer, 'w+').write( annotations ) + + anitSMASH(args, tmp_sequence, tmp_glimmer) + shutil.rmtree(temp_dir) + + + + + + + + + + + + diff -r 000000000000 -r 6a37d0a4510a readme.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/readme.txt Thu Mar 15 05:23:03 2012 -0400 @@ -0,0 +1,71 @@ +Galaxy wrapper for AntiSmash +===================================== + +This wrapper is copyright 2012 by Björn Grüning. + +This is a wrapper for the command line tool of antiSMASH. + +antiSMASH allows the rapid genome-wide identification, annotation and analysis of secondary metabolite biosynthesis gene clusters in bacterial and fungal genomes. +It integrates and cross-links with a large number of in silico secondary metabolite analysis tools. + +http://antismash.secondarymetabolites.org/ + +Marnix H. Medema, Kai Blin, Peter Cimermancic, Victor de Jager, Piotr Zakrzewski, Michael A. Fischbach, Tilmann Weber, Rainer Breitling & Eriko Takano (2011). +antiSMASH: Rapid identification, annotation and analysis of secondary metabolite biosynthesis gene clusters. Nucleic Acids Research 39: W339-W346. + + +Installation +============ + +Currently these wrapper is tested with version 1.1 and the modified version of antismash.py included in that repository. + +Install or downlaod antiSMASH from: + +http://antismash.secondarymetabolites.org/download.html + +... and follow the instructions. +Please replace the antismash.py file with the one inlcuded in that repository. +Edit the following lines in multi_antiSMASH_wrapper.py and antiSMASH_wrapper.py and adopt it to your installation. + +blastdbpath = '/home/galaxy/bin/antismash-1.1.0/db' +pfamdbpath = '/home/galaxy/bin/antismash-1.1.0/db' +antismash_path = '/home/galaxy/bin/antismash-1.1.0/antismash.py' + + +To install the wrapper copy the antiSMASH folder in the galaxy tools +folder and modify the tools_conf.xml file to make the tool available to Galaxy. +For example: + +
+ + +
+ + +History +======= + +v0.1 - Initial public release + + +Wrapper Licence (MIT/BSD style) +=============================== + +Permission to use, copy, modify, and distribute this software and its +documentation with or without modifications and for any purpose and +without fee is hereby granted, provided that any copyright notices +appear in all copies and that both those copyright notices and this +permission notice appear in supporting documentation, and that the +names of the contributors or copyright holders not be used in +advertising or publicity pertaining to distribution of the software +without specific prior permission. + +THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL +WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT +OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +OR PERFORMANCE OF THIS SOFTWARE. + diff -r 000000000000 -r 6a37d0a4510a tool_conf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_conf.xml Thu Mar 15 05:23:03 2012 -0400 @@ -0,0 +1,7 @@ + + +
+ + +
+

Gene order	Score
" + for m in l: + string = string + m + "," + if startergene != "" and endinggene != "": + string = string[:-1] + "	" + str(possible_orders_scoredict[tuple(l[1:-1])]) + elif startergene == "" and endinggene != "": + string = string[:-1] + "	" + str(possible_orders_scoredict[tuple(l[:-1])]) + elif startergene != "" and endinggene == "": + string = string[:-1] + "	" + str(possible_orders_scoredict[tuple(l[1:])]) + elif startergene == "" and endinggene == "": + string = string[:-1] + "	" + str(possible_orders_scoredict[tuple(l)]) + dockhtmlfile.write(string + "

Gene	Annotation	Start position	End position	Strand
' + i + '	' + annotations[i].replace("_"," ") + '	' + str(starts[genes.index(i)]) + '	' + str(ends[genes.index(i)]) + '	' + strands[genes.index(i)] + '

Gene	Detected domains	Bit scores
' + i + '	' + str(j[0]) + '	' + str(j[1]) + '